git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
Merge branch 'kvm-tdx-initial' into HEAD
author    Paolo Bonzini <pbonzini@redhat.com>
Wed, 19 Mar 2025 13:46:59 +0000 (09:46 -0400)
committer Paolo Bonzini <pbonzini@redhat.com>
Mon, 7 Apr 2025 11:36:33 +0000 (07:36 -0400)
This large commit contains the initial support for TDX in KVM.  All x86
parts enable the host-side hypercalls (SEAMCALLs) that KVM uses to talk to the TDX
module, a software component that runs in a special CPU mode called SEAM
(Secure Arbitration Mode).
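
As a rough orientation for readers unfamiliar with that calling convention, the
sketch below shows the general shape of a SEAMCALL call site.  It is a minimal,
self-contained illustration and not code from this merge: tdh_example_seamcall()
is a stand-in for the real tdh_*() wrappers added below, and the numeric status
values are placeholders; only the overall shape (a 64-bit status word,
TDX_SUCCESS == 0, error class encoded in the upper bits) is taken from the diff.

/*
 * Illustrative only -- not part of this merge.  tdh_example_seamcall()
 * stands in for real wrappers such as tdh_mng_key_config(); the
 * TDX_OPERAND_BUSY and status-mask values are assumptions.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TDX_SUCCESS              0x0000000000000000ULL
#define TDX_SEAMCALL_STATUS_MASK 0xffffffff00000000ULL  /* assumed value */
#define TDX_OPERAND_BUSY         0x8000020000000000ULL  /* assumed value */

/* Stub: a real wrapper executes the SEAMCALL instruction in the host kernel. */
static uint64_t tdh_example_seamcall(void)
{
	return TDX_SUCCESS;
}

/* Same test as the tdx_operand_busy() helper added in tdx.c below. */
static bool tdx_operand_busy(uint64_t err)
{
	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
}

int main(void)
{
	uint64_t err = tdh_example_seamcall();

	if (err == TDX_SUCCESS)
		return 0;

	if (tdx_operand_busy(err)) {
		/* Callers typically serialize (e.g. under a mutex) and retry. */
		fprintf(stderr, "SEAMCALL busy, retry needed\n");
		return 1;
	}

	fprintf(stderr, "SEAMCALL failed: 0x%llx\n", (unsigned long long)err);
	return 2;
}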

The series is in turn split into multiple sub-series, each with a separate
merge commit:

- Initialization: basic setup for using the TDX module from KVM, plus
  ioctls to create TDX VMs and vCPUs.

- MMU: in TDX, private and shared halves of the address space are mapped by
  different EPT roots, and the private half is managed by the TDX module.
  Using the support that was added to the generic MMU code in 6.14,
  add support for TDX's secure page tables to the Intel side of KVM.
  Generic KVM code takes care of maintaining a mirror of the secure page
  tables so that they can be queried efficiently, and ensuring that changes
  are applied to both the mirror and the secure EPT.

- vCPU enter/exit: implement the callbacks that handle the entry of a TDX
  vCPU (via the SEAMCALL TDH.VP.ENTER) and the corresponding save/restore
  of host state.

- Userspace exits: introduce support for guest TDVMCALLs that KVM forwards to
  userspace.  These correspond to the usual KVM_EXIT_* "heavyweight vmexits"
  but are triggered through a different mechanism, similar to VMGEXIT for
  SEV-ES and SEV-SNP.  (A hedged userspace-side sketch of this flow follows
  the list.)

- Interrupt handling: support for virtual interrupt injection as well as
  handling VM-Exits that are caused by vectored events.  Exclusive to
  TDX are machine-check SMIs, which the kernel already knows how to
  handle via its machine check handler (commit 7911f145de5f,
  "x86/mce: Implement recovery for errors in TDX/SEAM non-root mode").

- Loose ends: handling of the remaining exits from the TDX module, including
  EPT violation/misconfig and several TDVMCALL leaves that are handled in
  the kernel (CPUID, HLT, RDMSR/WRMSR, GetTdVmCallInfo); plus returning
  an error or ignoring operations that are not supported by TDX guests.
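
To make the "Userspace exits" item concrete, here is a hedged sketch of the VMM
side of one such forwarded TDVMCALL (TDVMCALL_MAP_GPA, which KVM converts into a
KVM_EXIT_HYPERCALL/KVM_HC_MAP_GPA_RANGE exit; see tdx_map_gpa() in the diff
below).  handle_map_gpa_range() is a hypothetical helper, not part of this
series; the exit layout (args[0..2], hypercall.ret) follows the code added in
arch/x86/kvm/vmx/tdx.c, and the VMM is assumed to have enabled
KVM_CAP_EXIT_HYPERCALL with the KVM_HC_MAP_GPA_RANGE bit.

/* Hedged userspace-side sketch -- not part of this merge. */
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <stdint.h>

/* Hypothetical VMM helper: flip the GPA range between private and shared. */
static int handle_map_gpa_range(uint64_t gpa, uint64_t npages, uint64_t attrs)
{
	(void)gpa; (void)npages; (void)attrs;
	return 0;	/* 0 is reported back to the guest as the TDVMCALL status */
}

static int handle_exit(struct kvm_run *run)
{
	if (run->exit_reason == KVM_EXIT_HYPERCALL &&
	    run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
		uint64_t gpa    = run->hypercall.args[0]; /* shared bit already stripped by KVM */
		uint64_t npages = run->hypercall.args[1];
		uint64_t attrs  = run->hypercall.args[2]; /* KVM_MAP_GPA_RANGE_{ENCRYPTED,DECRYPTED} */

		/* KVM copies this back to the guest's TDVMCALL return code (r10). */
		run->hypercall.ret = handle_map_gpa_range(gpa, npages, attrs);
		return 0;
	}

	return -1;	/* some other exit, handled elsewhere in the VMM */
}

Note that hypercall.ret matters here: as the comment in tdx.c below explains, KVM
now copies it back to the guest, so the VMM should set it explicitly.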

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
27 files changed:
Documentation/virt/kvm/api.rst
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/shared/tdx.h
arch/x86/include/asm/tdx.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/kvm/Kconfig
arch/x86/kvm/cpuid.c
arch/x86/kvm/cpuid.h
arch/x86/kvm/lapic.c
arch/x86/kvm/mmu/mmu.c
arch/x86/kvm/mmu/spte.c
arch/x86/kvm/mmu/spte.h
arch/x86/kvm/mmu/tdp_mmu.c
arch/x86/kvm/svm/svm.c
arch/x86/kvm/vmx/common.h
arch/x86/kvm/vmx/nested.c
arch/x86/kvm/vmx/posted_intr.c
arch/x86/kvm/vmx/tdx.c
arch/x86/kvm/vmx/vmx.c
arch/x86/kvm/x86.c
arch/x86/kvm/x86.h
include/linux/kvm_host.h
include/linux/misc_cgroup.h
include/uapi/linux/kvm.h
kernel/cgroup/misc.c
virt/kvm/kvm_main.c

Simple merge
Simple merge
Simple merge
index 4a1922ec80cf76ffb9662d0dd34eb02a7c66b4d4,6d74f4d8c573140bf38f4ec7d6eb12c799d1fc5d..26ffc792e6734456f7965918a998f7c7e9ade646
  #define TDX_SUCCESS           0ULL
  #define TDX_RND_NO_ENTROPY    0x8000020300000000ULL
  
 -#ifndef __ASSEMBLY__
 +#ifndef __ASSEMBLER__
  
  #include <uapi/asm/mce.h>
+ #include <asm/tdx_global_metadata.h>
+ #include <linux/pgtable.h>
  
  /*
   * Used by the #VE exception handler to gather the #VE exception
@@@ -123,8 -196,10 +196,10 @@@ u64 tdh_phymem_page_wbinvd_hkid(u64 hki
  static inline void tdx_init(void) { }
  static inline int tdx_cpu_enable(void) { return -ENODEV; }
  static inline int tdx_enable(void)  { return -ENODEV; }
+ static inline u32 tdx_get_nr_guest_keyids(void) { return 0; }
  static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; }
+ static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; }
  #endif        /* CONFIG_INTEL_TDX_HOST */
  
 -#endif /* !__ASSEMBLY__ */
 +#endif /* !__ASSEMBLER__ */
  #endif /* _ASM_X86_TDX_H */
index 8707361b24daa0c69703d4a688b1556a709296b8,028f3b8db2afe94a819157ce239ef463aeea1866..cca7d664128752e2fd9175d0c4b9357f84dd3a84
@@@ -580,22 -581,20 +581,23 @@@ enum vm_entry_failure_code 
  /*
   * Exit Qualifications for EPT Violations
   */
 -#define EPT_VIOLATION_ACC_READ_BIT    0
 -#define EPT_VIOLATION_ACC_WRITE_BIT   1
 -#define EPT_VIOLATION_ACC_INSTR_BIT   2
 -#define EPT_VIOLATION_RWX_SHIFT               3
 -#define EPT_VIOLATION_EXEC_R3_LIN_BIT 6
 -#define EPT_VIOLATION_GVA_IS_VALID_BIT        7
 -#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 -#define EPT_VIOLATION_ACC_READ                (1 << EPT_VIOLATION_ACC_READ_BIT)
 -#define EPT_VIOLATION_ACC_WRITE               (1 << EPT_VIOLATION_ACC_WRITE_BIT)
 -#define EPT_VIOLATION_ACC_INSTR               (1 << EPT_VIOLATION_ACC_INSTR_BIT)
 -#define EPT_VIOLATION_RWX_MASK                (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
 -#define EPT_VIOLATION_EXEC_FOR_RING3_LIN (1 << EPT_VIOLATION_EXEC_R3_LIN_BIT)
 -#define EPT_VIOLATION_GVA_IS_VALID    (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
 -#define EPT_VIOLATION_GVA_TRANSLATED  (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 +#define EPT_VIOLATION_ACC_READ                BIT(0)
 +#define EPT_VIOLATION_ACC_WRITE               BIT(1)
 +#define EPT_VIOLATION_ACC_INSTR               BIT(2)
 +#define EPT_VIOLATION_PROT_READ               BIT(3)
 +#define EPT_VIOLATION_PROT_WRITE      BIT(4)
 +#define EPT_VIOLATION_PROT_EXEC               BIT(5)
++#define EPT_VIOLATION_EXEC_FOR_RING3_LIN BIT(6)
 +#define EPT_VIOLATION_PROT_MASK               (EPT_VIOLATION_PROT_READ  | \
 +                                       EPT_VIOLATION_PROT_WRITE | \
 +                                       EPT_VIOLATION_PROT_EXEC)
 +#define EPT_VIOLATION_GVA_IS_VALID    BIT(7)
 +#define EPT_VIOLATION_GVA_TRANSLATED  BIT(8)
 +
 +#define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
 +
 +static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
 +            (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
  
  /*
   * Exit Qualifications for NOTIFY VM EXIT
Simple merge
Simple merge
Simple merge
index d2884162a46a66994e6d82a61f85973910181749,5c174421a4d977349c86dbfec483808b3d82744c..d3f5ae15a7ca314675ad0b4e7347ddd2c205a69f
@@@ -11,10 -11,35 +11,34 @@@ extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS
  void kvm_set_cpu_caps(void);
  
  void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu);
- struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
-                                                   u32 function, u32 index);
- struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
-                                             u32 function);
 -void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu);
+ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry2(struct kvm_cpuid_entry2 *entries,
+                                              int nent, u32 function, u64 index);
+ /*
+  * Magic value used by KVM when querying userspace-provided CPUID entries and
+  * doesn't care about the CPUID index because the index of the function in
+  * question is not significant.  Note, this magic value must have at least one
+  * bit set in bits[63:32] and must be consumed as a u64 by kvm_find_cpuid_entry2()
+  * to avoid false positives when processing guest CPUID input.
+  *
+  * KVM_CPUID_INDEX_NOT_SIGNIFICANT should never be used directly outside of
+  * kvm_find_cpuid_entry2() and kvm_find_cpuid_entry().
+  */
+ #define KVM_CPUID_INDEX_NOT_SIGNIFICANT -1ull
+ static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu,
+                                                                 u32 function, u32 index)
+ {
+       return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+                                    function, index);
+ }
+ static inline struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+                                                           u32 function)
+ {
+       return kvm_find_cpuid_entry2(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
+                                    function, KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ }
  int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
                            struct kvm_cpuid_entry2 __user *entries,
                            unsigned int type);
Simple merge
Simple merge
Simple merge
Simple merge
index 21a3b816624239c9bd7c29ab943c7f8697da3468,fd0a7792386b88fcc2a4bec2220b8a5277f4d515..405874f4d08803b1de724c2d34a16e429ad33a43
@@@ -1919,9 -1901,7 +1918,7 @@@ static int __kvm_tdp_mmu_get_walk(struc
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;
  
-       *root_level = vcpu->arch.mmu->root_role.level;
 -      tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
 +      for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }
Simple merge
index 0000000000000000000000000000000000000000,8fe6a7fa1c68b72a12536125de1a1a359e76906c..8f46a06e2c44147aa8f6ee501f1f32a6ff0eac09
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,182 +1,182 @@@
 -      error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
+ /* SPDX-License-Identifier: GPL-2.0-only */
+ #ifndef __KVM_X86_VMX_COMMON_H
+ #define __KVM_X86_VMX_COMMON_H
+ #include <linux/kvm_host.h>
+ #include <asm/posted_intr.h>
+ #include "mmu.h"
+ union vmx_exit_reason {
+       struct {
+               u32     basic                   : 16;
+               u32     reserved16              : 1;
+               u32     reserved17              : 1;
+               u32     reserved18              : 1;
+               u32     reserved19              : 1;
+               u32     reserved20              : 1;
+               u32     reserved21              : 1;
+               u32     reserved22              : 1;
+               u32     reserved23              : 1;
+               u32     reserved24              : 1;
+               u32     reserved25              : 1;
+               u32     bus_lock_detected       : 1;
+               u32     enclave_mode            : 1;
+               u32     smi_pending_mtf         : 1;
+               u32     smi_from_vmx_root       : 1;
+               u32     reserved30              : 1;
+               u32     failed_vmentry          : 1;
+       };
+       u32 full;
+ };
+ struct vcpu_vt {
+       /* Posted interrupt descriptor */
+       struct pi_desc pi_desc;
+       /* Used if this vCPU is waiting for PI notification wakeup. */
+       struct list_head pi_wakeup_list;
+       union vmx_exit_reason exit_reason;
+       unsigned long   exit_qualification;
+       u32             exit_intr_info;
+       /*
+        * If true, guest state has been loaded into hardware, and host state
+        * saved into vcpu_{vt,vmx,tdx}.  If false, host state is loaded into
+        * hardware.
+        */
+       bool            guest_state_loaded;
+       bool            emulation_required;
+ #ifdef CONFIG_X86_64
+       u64             msr_host_kernel_gs_base;
+ #endif
+       unsigned long   host_debugctlmsr;
+ };
+ #ifdef CONFIG_KVM_INTEL_TDX
+ static __always_inline bool is_td(struct kvm *kvm)
+ {
+       return kvm->arch.vm_type == KVM_X86_TDX_VM;
+ }
+ static __always_inline bool is_td_vcpu(struct kvm_vcpu *vcpu)
+ {
+       return is_td(vcpu->kvm);
+ }
+ #else
+ static inline bool is_td(struct kvm *kvm) { return false; }
+ static inline bool is_td_vcpu(struct kvm_vcpu *vcpu) { return false; }
+ #endif
+ static inline bool vt_is_tdx_private_gpa(struct kvm *kvm, gpa_t gpa)
+ {
+       /* For TDX the direct mask is the shared mask. */
+       return !kvm_is_addr_direct(kvm, gpa);
+ }
+ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
+                                            unsigned long exit_qualification)
+ {
+       u64 error_code;
+       /* Is it a read fault? */
+       error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+                    ? PFERR_USER_MASK : 0;
+       /* Is it a write fault? */
+       error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+                     ? PFERR_WRITE_MASK : 0;
+       /* Is it a fetch fault? */
+       error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+                     ? PFERR_FETCH_MASK : 0;
+       /* ept page table entry is present? */
++      error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
+                     ? PFERR_PRESENT_MASK : 0;
+       if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+               error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+                             PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+       if (vt_is_tdx_private_gpa(vcpu->kvm, gpa))
+               error_code |= PFERR_PRIVATE_ACCESS;
+       return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
+ }
+ static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                    int pi_vec)
+ {
+ #ifdef CONFIG_SMP
+       if (vcpu->mode == IN_GUEST_MODE) {
+               /*
+                * The vector of the virtual interrupt has already been set in the PIR.
+                * Send a notification event to deliver the virtual interrupt
+                * unless the vCPU is the currently running vCPU, i.e. the
+                * event is being sent from a fastpath VM-Exit handler, in
+                * which case the PIR will be synced to the vIRR before
+                * re-entering the guest.
+                *
+                * When the target is not the running vCPU, the following
+                * possibilities emerge:
+                *
+                * Case 1: vCPU stays in non-root mode. Sending a notification
+                * event posts the interrupt to the vCPU.
+                *
+                * Case 2: vCPU exits to root mode and is still runnable. The
+                * PIR will be synced to the vIRR before re-entering the guest.
+                * Sending a notification event is ok as the host IRQ handler
+                * will ignore the spurious event.
+                *
+                * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
+                * has already synced PIR to vIRR and never blocks the vCPU if
+                * the vIRR is not empty. Therefore, a blocked vCPU here does
+                * not wait for any requested interrupts in PIR, and sending a
+                * notification event also results in a benign, spurious event.
+                */
+               if (vcpu != kvm_get_running_vcpu())
+                       __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
+               return;
+       }
+ #endif
+       /*
+        * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+        * otherwise do nothing as KVM will grab the highest priority pending
+        * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+        */
+       kvm_vcpu_wake_up(vcpu);
+ }
+ /*
+  * Post an interrupt to a vCPU's PIR and trigger the vCPU to process the
+  * interrupt if necessary.
+  */
+ static inline void __vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu,
+                                                 struct pi_desc *pi_desc, int vector)
+ {
+       if (pi_test_and_set_pir(vector, pi_desc))
+               return;
+       /* If a previous notification has sent the IPI, nothing to do.  */
+       if (pi_test_and_set_on(pi_desc))
+               return;
+       /*
+        * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+        * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+        * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+        * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+        */
+       kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
+ }
+ noinstr void vmx_handle_nmi(struct kvm_vcpu *vcpu);
+ #endif /* __KVM_X86_VMX_COMMON_H */
Simple merge
index 51116fe69a5001b91d95bf8c3417533b1e2c55e0,f2ca37b3f606e0129745e4b2c2e551d673d2c8a1..73028957cc66475593c487e6c5c1c1d2177f6763
@@@ -31,11 -32,9 +32,11 @@@ static DEFINE_PER_CPU(struct list_head
   */
  static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
  
- static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 +#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
 +
+ struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
  {
-       return &(to_vmx(vcpu)->pi_desc);
+       return &(to_vt(vcpu)->pi_desc);
  }
  
  static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new)
@@@ -91,20 -90,9 +92,20 @@@ void vmx_vcpu_pi_load(struct kvm_vcpu *
         * current pCPU if the task was migrated.
         */
        if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
 -              raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 +              raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
 +
 +              /*
 +               * In addition to taking the wakeup lock for the regular/IRQ
 +               * context, tell lockdep it is being taken for the "sched out"
 +               * context as well.  vCPU loads happen in task context, and
 +               * this is taking the lock of the *previous* CPU, i.e. can race
 +               * with both the scheduler and the wakeup handler.
 +               */
 +              raw_spin_lock(spinlock);
 +              spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
-               list_del(&vmx->pi_wakeup_list);
+               list_del(&vt->pi_wakeup_list);
 -              raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 +              spin_release(&spinlock->dep_map, _RET_IP_);
 +              raw_spin_unlock(spinlock);
        }
  
        dest = cpu_physical_id(cpu);
@@@ -159,26 -147,14 +160,26 @@@ static bool vmx_can_use_vtd_pi(struct k
  static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
  {
        struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vcpu_vt *vt = to_vt(vcpu);
        struct pi_desc old, new;
 -      unsigned long flags;
  
 -      local_irq_save(flags);
 +      lockdep_assert_irqs_disabled();
  
 -      raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
 +      /*
 +       * Acquire the wakeup lock using the "sched out" context to workaround
 +       * a lockdep false positive.  When this is called, schedule() holds
 +       * various per-CPU scheduler locks.  When the wakeup handler runs, it
 +       * holds this CPU's wakeup lock while calling try_to_wake_up(), which
 +       * can eventually take the aforementioned scheduler locks, which causes
 +       * lockdep to assume there is deadlock.
 +       *
 +       * Deadlock can't actually occur because IRQs are disabled for the
 +       * entirety of the sched_out critical section, i.e. the wakeup handler
 +       * can't run while the scheduler locks are held.
 +       */
 +      raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
 +                           PI_LOCK_SCHED_OUT);
-       list_add_tail(&vmx->pi_wakeup_list,
+       list_add_tail(&vt->pi_wakeup_list,
                      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
        raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
  
index 0000000000000000000000000000000000000000,fb4f9aa84b959741e5673c835728977cc6a539a6..b952bc6732713910cd0de6bb14144fba84f3d099
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,3526 +1,3526 @@@
 -      return !(eq & EPT_VIOLATION_RWX_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+ // SPDX-License-Identifier: GPL-2.0
+ #include <linux/cleanup.h>
+ #include <linux/cpu.h>
+ #include <asm/cpufeature.h>
+ #include <asm/fpu/xcr.h>
+ #include <linux/misc_cgroup.h>
+ #include <linux/mmu_context.h>
+ #include <asm/tdx.h>
+ #include "capabilities.h"
+ #include "mmu.h"
+ #include "x86_ops.h"
+ #include "lapic.h"
+ #include "tdx.h"
+ #include "vmx.h"
+ #include "mmu/spte.h"
+ #include "common.h"
+ #include "posted_intr.h"
+ #include "irq.h"
+ #include <trace/events/kvm.h>
+ #include "trace.h"
+ #pragma GCC poison to_vmx
+ #undef pr_fmt
+ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ #define pr_tdx_error(__fn, __err)     \
+       pr_err_ratelimited("SEAMCALL %s failed: 0x%llx\n", #__fn, __err)
+ #define __pr_tdx_error_N(__fn_str, __err, __fmt, ...)         \
+       pr_err_ratelimited("SEAMCALL " __fn_str " failed: 0x%llx, " __fmt,  __err,  __VA_ARGS__)
+ #define pr_tdx_error_1(__fn, __err, __rcx)            \
+       __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx\n", __rcx)
+ #define pr_tdx_error_2(__fn, __err, __rcx, __rdx)     \
+       __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx\n", __rcx, __rdx)
+ #define pr_tdx_error_3(__fn, __err, __rcx, __rdx, __r8)       \
+       __pr_tdx_error_N(#__fn, __err, "rcx 0x%llx, rdx 0x%llx, r8 0x%llx\n", __rcx, __rdx, __r8)
+ bool enable_tdx __ro_after_init;
+ module_param_named(tdx, enable_tdx, bool, 0444);
+ #define TDX_SHARED_BIT_PWL_5 gpa_to_gfn(BIT_ULL(51))
+ #define TDX_SHARED_BIT_PWL_4 gpa_to_gfn(BIT_ULL(47))
+ static enum cpuhp_state tdx_cpuhp_state;
+ static const struct tdx_sys_info *tdx_sysinfo;
+ void tdh_vp_rd_failed(struct vcpu_tdx *tdx, char *uclass, u32 field, u64 err)
+ {
+       KVM_BUG_ON(1, tdx->vcpu.kvm);
+       pr_err("TDH_VP_RD[%s.0x%x] failed 0x%llx\n", uclass, field, err);
+ }
+ void tdh_vp_wr_failed(struct vcpu_tdx *tdx, char *uclass, char *op, u32 field,
+                     u64 val, u64 err)
+ {
+       KVM_BUG_ON(1, tdx->vcpu.kvm);
+       pr_err("TDH_VP_WR[%s.0x%x]%s0x%llx failed: 0x%llx\n", uclass, field, op, val, err);
+ }
+ #define KVM_SUPPORTED_TD_ATTRS (TDX_TD_ATTR_SEPT_VE_DISABLE)
+ static __always_inline struct kvm_tdx *to_kvm_tdx(struct kvm *kvm)
+ {
+       return container_of(kvm, struct kvm_tdx, kvm);
+ }
+ static __always_inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
+ {
+       return container_of(vcpu, struct vcpu_tdx, vcpu);
+ }
+ static u64 tdx_get_supported_attrs(const struct tdx_sys_info_td_conf *td_conf)
+ {
+       u64 val = KVM_SUPPORTED_TD_ATTRS;
+       if ((val & td_conf->attributes_fixed1) != td_conf->attributes_fixed1)
+               return 0;
+       val &= td_conf->attributes_fixed0;
+       return val;
+ }
+ static u64 tdx_get_supported_xfam(const struct tdx_sys_info_td_conf *td_conf)
+ {
+       u64 val = kvm_caps.supported_xcr0 | kvm_caps.supported_xss;
+       if ((val & td_conf->xfam_fixed1) != td_conf->xfam_fixed1)
+               return 0;
+       val &= td_conf->xfam_fixed0;
+       return val;
+ }
+ static int tdx_get_guest_phys_addr_bits(const u32 eax)
+ {
+       return (eax & GENMASK(23, 16)) >> 16;
+ }
+ static u32 tdx_set_guest_phys_addr_bits(const u32 eax, int addr_bits)
+ {
+       return (eax & ~GENMASK(23, 16)) | (addr_bits & 0xff) << 16;
+ }
+ #define TDX_FEATURE_TSX (__feature_bit(X86_FEATURE_HLE) | __feature_bit(X86_FEATURE_RTM))
+ static bool has_tsx(const struct kvm_cpuid_entry2 *entry)
+ {
+       return entry->function == 7 && entry->index == 0 &&
+              (entry->ebx & TDX_FEATURE_TSX);
+ }
+ static void clear_tsx(struct kvm_cpuid_entry2 *entry)
+ {
+       entry->ebx &= ~TDX_FEATURE_TSX;
+ }
+ static bool has_waitpkg(const struct kvm_cpuid_entry2 *entry)
+ {
+       return entry->function == 7 && entry->index == 0 &&
+              (entry->ecx & __feature_bit(X86_FEATURE_WAITPKG));
+ }
+ static void clear_waitpkg(struct kvm_cpuid_entry2 *entry)
+ {
+       entry->ecx &= ~__feature_bit(X86_FEATURE_WAITPKG);
+ }
+ static void tdx_clear_unsupported_cpuid(struct kvm_cpuid_entry2 *entry)
+ {
+       if (has_tsx(entry))
+               clear_tsx(entry);
+       if (has_waitpkg(entry))
+               clear_waitpkg(entry);
+ }
+ static bool tdx_unsupported_cpuid(const struct kvm_cpuid_entry2 *entry)
+ {
+       return has_tsx(entry) || has_waitpkg(entry);
+ }
+ #define KVM_TDX_CPUID_NO_SUBLEAF      ((__u32)-1)
+ static void td_init_cpuid_entry2(struct kvm_cpuid_entry2 *entry, unsigned char idx)
+ {
+       const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+       entry->function = (u32)td_conf->cpuid_config_leaves[idx];
+       entry->index = td_conf->cpuid_config_leaves[idx] >> 32;
+       entry->eax = (u32)td_conf->cpuid_config_values[idx][0];
+       entry->ebx = td_conf->cpuid_config_values[idx][0] >> 32;
+       entry->ecx = (u32)td_conf->cpuid_config_values[idx][1];
+       entry->edx = td_conf->cpuid_config_values[idx][1] >> 32;
+       if (entry->index == KVM_TDX_CPUID_NO_SUBLEAF)
+               entry->index = 0;
+       /*
+        * The TDX module doesn't allow configuring the guest phys addr bits
+        * (EAX[23:16]).  However, KVM uses it as an interface to the userspace
+        * to configure the GPAW.  Report these bits as configurable.
+        */
+       if (entry->function == 0x80000008)
+               entry->eax = tdx_set_guest_phys_addr_bits(entry->eax, 0xff);
+       tdx_clear_unsupported_cpuid(entry);
+ }
+ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
+                            struct kvm_tdx_capabilities *caps)
+ {
+       int i;
+       caps->supported_attrs = tdx_get_supported_attrs(td_conf);
+       if (!caps->supported_attrs)
+               return -EIO;
+       caps->supported_xfam = tdx_get_supported_xfam(td_conf);
+       if (!caps->supported_xfam)
+               return -EIO;
+       caps->cpuid.nent = td_conf->num_cpuid_config;
+       for (i = 0; i < td_conf->num_cpuid_config; i++)
+               td_init_cpuid_entry2(&caps->cpuid.entries[i], i);
+       return 0;
+ }
+ /*
+  * Some SEAMCALLs acquire the TDX module globally, and can fail with
+  * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
+  */
+ static DEFINE_MUTEX(tdx_lock);
+ static atomic_t nr_configured_hkid;
+ static bool tdx_operand_busy(u64 err)
+ {
+       return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
+ }
+ /*
+  * A per-CPU list of TD vCPUs associated with a given CPU.
+  * Protected by interrupt mask. Only manipulated by the CPU owning this per-CPU
+  * list.
+  * - When a vCPU is loaded onto a CPU, it is removed from the per-CPU list of
+  *   the old CPU during the IPI callback running on the old CPU, and then added
+  *   to the per-CPU list of the new CPU.
+  * - When a TD is tearing down, all vCPUs are disassociated from their current
+  *   running CPUs and removed from the per-CPU list during the IPI callback
+  *   running on those CPUs.
+  * - When a CPU is brought down, traverse the per-CPU list to disassociate all
+  *   associated TD vCPUs and remove them from the per-CPU list.
+  */
+ static DEFINE_PER_CPU(struct list_head, associated_tdvcpus);
+ static __always_inline unsigned long tdvmcall_exit_type(struct kvm_vcpu *vcpu)
+ {
+       return to_tdx(vcpu)->vp_enter_args.r10;
+ }
+ static __always_inline unsigned long tdvmcall_leaf(struct kvm_vcpu *vcpu)
+ {
+       return to_tdx(vcpu)->vp_enter_args.r11;
+ }
+ static __always_inline void tdvmcall_set_return_code(struct kvm_vcpu *vcpu,
+                                                    long val)
+ {
+       to_tdx(vcpu)->vp_enter_args.r10 = val;
+ }
+ static __always_inline void tdvmcall_set_return_val(struct kvm_vcpu *vcpu,
+                                                   unsigned long val)
+ {
+       to_tdx(vcpu)->vp_enter_args.r11 = val;
+ }
+ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+ {
+       tdx_guest_keyid_free(kvm_tdx->hkid);
+       kvm_tdx->hkid = -1;
+       atomic_dec(&nr_configured_hkid);
+       misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+       put_misc_cg(kvm_tdx->misc_cg);
+       kvm_tdx->misc_cg = NULL;
+ }
+ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+ {
+       return kvm_tdx->hkid > 0;
+ }
+ static inline void tdx_disassociate_vp(struct kvm_vcpu *vcpu)
+ {
+       lockdep_assert_irqs_disabled();
+       list_del(&to_tdx(vcpu)->cpu_list);
+       /*
+        * Ensure tdx->cpu_list is updated before setting vcpu->cpu to -1,
+        * otherwise, a different CPU can see vcpu->cpu = -1 and add the vCPU
+        * to its list before it's deleted from this CPU's list.
+        */
+       smp_wmb();
+       vcpu->cpu = -1;
+ }
+ static void tdx_clear_page(struct page *page)
+ {
+       const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
+       void *dest = page_to_virt(page);
+       unsigned long i;
+       /*
+        * The page could have been poisoned.  MOVDIR64B also clears
+        * the poison bit so the kernel can safely use the page again.
+        */
+       for (i = 0; i < PAGE_SIZE; i += 64)
+               movdir64b(dest + i, zero_page);
+       /*
+        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
+        * from seeing potentially poisoned cache.
+        */
+       __mb();
+ }
+ static void tdx_no_vcpus_enter_start(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+ }
+ static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
+ }
+ /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+ static int __tdx_reclaim_page(struct page *page)
+ {
+       u64 err, rcx, rdx, r8;
+       err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
+       /*
+        * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
+        * before the HKID is released and control pages have also been
+        * released at this point, so there is no possibility of contention.
+        */
+       if (WARN_ON_ONCE(err)) {
+               pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+               return -EIO;
+       }
+       return 0;
+ }
+ static int tdx_reclaim_page(struct page *page)
+ {
+       int r;
+       r = __tdx_reclaim_page(page);
+       if (!r)
+               tdx_clear_page(page);
+       return r;
+ }
+ /*
+  * Reclaim the TD control page(s) which are crypto-protected by TDX guest's
+  * private KeyID.  Assume the cache associated with the TDX private KeyID has
+  * been flushed.
+  */
+ static void tdx_reclaim_control_page(struct page *ctrl_page)
+ {
+       /*
+        * Leak the page if the kernel failed to reclaim the page.
+        * The kernel cannot use it safely anymore.
+        */
+       if (tdx_reclaim_page(ctrl_page))
+               return;
+       __free_page(ctrl_page);
+ }
+ struct tdx_flush_vp_arg {
+       struct kvm_vcpu *vcpu;
+       u64 err;
+ };
+ static void tdx_flush_vp(void *_arg)
+ {
+       struct tdx_flush_vp_arg *arg = _arg;
+       struct kvm_vcpu *vcpu = arg->vcpu;
+       u64 err;
+       arg->err = 0;
+       lockdep_assert_irqs_disabled();
+       /* Task migration can race with CPU offlining. */
+       if (unlikely(vcpu->cpu != raw_smp_processor_id()))
+               return;
+       /*
+        * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized.  The
+        * list tracking still needs to be updated so that it's correct if/when
+        * the vCPU does get initialized.
+        */
+       if (to_tdx(vcpu)->state != VCPU_TD_STATE_UNINITIALIZED) {
+               /*
+                * No need to retry.  TDX Resources needed for TDH.VP.FLUSH are:
+                * TDVPR as exclusive, TDR as shared, and TDCS as shared.  This
+                * VP flush function is called when destroying a vCPU/TD or during
+                * vCPU migration.  No other thread uses TDVPR in those cases.
+                */
+               err = tdh_vp_flush(&to_tdx(vcpu)->vp);
+               if (unlikely(err && err != TDX_VCPU_NOT_ASSOCIATED)) {
+                       /*
+                        * This function is called in IPI context. Do not use
+                        * printk to avoid console semaphore.
+                        * The caller prints out the error message, instead.
+                        */
+                       if (err)
+                               arg->err = err;
+               }
+       }
+       tdx_disassociate_vp(vcpu);
+ }
+ static void tdx_flush_vp_on_cpu(struct kvm_vcpu *vcpu)
+ {
+       struct tdx_flush_vp_arg arg = {
+               .vcpu = vcpu,
+       };
+       int cpu = vcpu->cpu;
+       if (unlikely(cpu == -1))
+               return;
+       smp_call_function_single(cpu, tdx_flush_vp, &arg, 1);
+       if (KVM_BUG_ON(arg.err, vcpu->kvm))
+               pr_tdx_error(TDH_VP_FLUSH, arg.err);
+ }
+ void tdx_disable_virtualization_cpu(void)
+ {
+       int cpu = raw_smp_processor_id();
+       struct list_head *tdvcpus = &per_cpu(associated_tdvcpus, cpu);
+       struct tdx_flush_vp_arg arg;
+       struct vcpu_tdx *tdx, *tmp;
+       unsigned long flags;
+       local_irq_save(flags);
+       /* Safe variant needed as tdx_disassociate_vp() deletes the entry. */
+       list_for_each_entry_safe(tdx, tmp, tdvcpus, cpu_list) {
+               arg.vcpu = &tdx->vcpu;
+               tdx_flush_vp(&arg);
+       }
+       local_irq_restore(flags);
+ }
+ #define TDX_SEAMCALL_RETRIES 10000
+ static void smp_func_do_phymem_cache_wb(void *unused)
+ {
+       u64 err = 0;
+       bool resume;
+       int i;
+       /*
+        * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+        * KeyID on the package or core.  The TDX module may not finish the
+        * cache flush but return TDX_INTERRUPTED_RESUMEABLE instead.  The
+        * kernel should retry it until it returns success w/o rescheduling.
+        */
+       for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+               resume = !!err;
+               err = tdh_phymem_cache_wb(resume);
+               switch (err) {
+               case TDX_INTERRUPTED_RESUMABLE:
+                       continue;
+               case TDX_NO_HKID_READY_TO_WBCACHE:
+                       err = TDX_SUCCESS; /* Already done by other thread */
+                       fallthrough;
+               default:
+                       goto out;
+               }
+       }
+ out:
+       if (WARN_ON_ONCE(err))
+               pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+ }
+ void tdx_mmu_release_hkid(struct kvm *kvm)
+ {
+       bool packages_allocated, targets_allocated;
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       cpumask_var_t packages, targets;
+       struct kvm_vcpu *vcpu;
+       unsigned long j;
+       int i;
+       u64 err;
+       if (!is_hkid_assigned(kvm_tdx))
+               return;
+       packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+       targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+       cpus_read_lock();
+       kvm_for_each_vcpu(j, vcpu, kvm)
+               tdx_flush_vp_on_cpu(vcpu);
+       /*
+        * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
+        * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
+        * Multiple TDX guests can be destroyed simultaneously. Take the
+        * mutex to prevent that error.
+        */
+       mutex_lock(&tdx_lock);
+       /*
+        * Releasing HKID is in vm_destroy().
+        * After flushing the vCPUs above, there should be no more vCPU
+        * associations, as all vCPU fds have been released at this stage.
+        */
+       err = tdh_mng_vpflushdone(&kvm_tdx->td);
+       if (err == TDX_FLUSHVP_NOT_DONE)
+               goto out;
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_MNG_VPFLUSHDONE, err);
+               pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
+                      kvm_tdx->hkid);
+               goto out;
+       }
+       for_each_online_cpu(i) {
+               if (packages_allocated &&
+                   cpumask_test_and_set_cpu(topology_physical_package_id(i),
+                                            packages))
+                       continue;
+               if (targets_allocated)
+                       cpumask_set_cpu(i, targets);
+       }
+       if (targets_allocated)
+               on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+       else
+               on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+       /*
+        * In the case of error in smp_func_do_phymem_cache_wb(), the following
+        * tdh_mng_key_freeid() will fail.
+        */
+       err = tdh_mng_key_freeid(&kvm_tdx->td);
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+               pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+                      kvm_tdx->hkid);
+       } else {
+               tdx_hkid_free(kvm_tdx);
+       }
+ out:
+       mutex_unlock(&tdx_lock);
+       cpus_read_unlock();
+       free_cpumask_var(targets);
+       free_cpumask_var(packages);
+ }
+ static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       u64 err;
+       int i;
+       /*
+        * tdx_mmu_release_hkid() failed to reclaim the HKID.  Something went
+        * seriously wrong with the TDX module.  Give up freeing TD pages.  The
+        * function already warned, so don't warn again.
+        */
+       if (is_hkid_assigned(kvm_tdx))
+               return;
+       if (kvm_tdx->td.tdcs_pages) {
+               for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+                       if (!kvm_tdx->td.tdcs_pages[i])
+                               continue;
+                       tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
+               }
+               kfree(kvm_tdx->td.tdcs_pages);
+               kvm_tdx->td.tdcs_pages = NULL;
+       }
+       if (!kvm_tdx->td.tdr_page)
+               return;
+       if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
+               return;
+       /*
+        * Use a SEAMCALL to ask the TDX module to flush the cache based on the
+        * KeyID.  The TDX module may access the TDR while operating on the TD
+        * (especially when it is reclaiming the TDCS).
+        */
+       err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+               return;
+       }
+       tdx_clear_page(kvm_tdx->td.tdr_page);
+       __free_page(kvm_tdx->td.tdr_page);
+       kvm_tdx->td.tdr_page = NULL;
+ }
+ void tdx_vm_destroy(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       tdx_reclaim_td_control_pages(kvm);
+       kvm_tdx->state = TD_STATE_UNINITIALIZED;
+ }
+ static int tdx_do_tdh_mng_key_config(void *param)
+ {
+       struct kvm_tdx *kvm_tdx = param;
+       u64 err;
+       /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+       err = tdh_mng_key_config(&kvm_tdx->td);
+       if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+               pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+               return -EIO;
+       }
+       return 0;
+ }
+ int tdx_vm_init(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       kvm->arch.has_protected_state = true;
+       kvm->arch.has_private_mem = true;
+       kvm->arch.disabled_quirks |= KVM_X86_QUIRK_IGNORE_GUEST_PAT;
+       /*
+        * Because the guest TD is protected, the VMM can't parse instructions
+        * executed in the TD.  Instead, the guest uses the MMIO hypercall.  For
+        * unmodified device drivers, a #VE needs to be injected for MMIO, and
+        * the #VE handler in the TD converts the MMIO instruction into an MMIO
+        * hypercall.
+        *
+        * The SPTE value for MMIO needs to be set up so that a #VE is injected
+        * into the TD instead of triggering EPT MISCONFIG.
+        * - RWX=0 so that EPT violation is triggered.
+        * - suppress #VE bit is cleared to inject #VE.
+        */
+       kvm_mmu_set_mmio_spte_value(kvm, 0);
+       /*
+        * TDX has its own limit of maximum vCPUs it can support for all
+        * TDX guests in addition to KVM_MAX_VCPUS.  TDX module reports
+        * such limit via the MAX_VCPU_PER_TD global metadata.  In
+        * practice, it reflects the number of logical CPUs that ALL
+        * platforms that the TDX module supports can possibly have.
+        *
+        * Limit TDX guest's maximum vCPUs to the number of logical CPUs
+        * the platform has.  Simply forwarding the MAX_VCPU_PER_TD to
+        * userspace would result in an unpredictable ABI.
+        */
+       kvm->max_vcpus = min_t(int, kvm->max_vcpus, num_present_cpus());
+       kvm_tdx->state = TD_STATE_UNINITIALIZED;
+       return 0;
+ }
+ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       if (kvm_tdx->state != TD_STATE_INITIALIZED)
+               return -EIO;
+       /*
+        * TDX module mandates APICv, which requires an in-kernel local APIC.
+        * Disallow an in-kernel I/O APIC, because level-triggered interrupts
+        * and thus the I/O APIC as a whole can't be faithfully emulated in KVM.
+        */
+       if (!irqchip_split(vcpu->kvm))
+               return -EINVAL;
+       fpstate_set_confidential(&vcpu->arch.guest_fpu);
+       vcpu->arch.apic->guest_apic_protected = true;
+       INIT_LIST_HEAD(&tdx->vt.pi_wakeup_list);
+       vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+       vcpu->arch.switch_db_regs = KVM_DEBUGREG_AUTO_SWITCH;
+       vcpu->arch.cr0_guest_owned_bits = -1ul;
+       vcpu->arch.cr4_guest_owned_bits = -1ul;
+       /* KVM can't change TSC offset/multiplier as TDX module manages them. */
+       vcpu->arch.guest_tsc_protected = true;
+       vcpu->arch.tsc_offset = kvm_tdx->tsc_offset;
+       vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+       vcpu->arch.tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+       vcpu->arch.l1_tsc_scaling_ratio = kvm_tdx->tsc_multiplier;
+       vcpu->arch.guest_state_protected =
+               !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTR_DEBUG);
+       if ((kvm_tdx->xfam & XFEATURE_MASK_XTILE) == XFEATURE_MASK_XTILE)
+               vcpu->arch.xfd_no_write_intercept = true;
+       tdx->vt.pi_desc.nv = POSTED_INTR_VECTOR;
+       __pi_set_sn(&tdx->vt.pi_desc);
+       tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+       return 0;
+ }
+ void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       vmx_vcpu_pi_load(vcpu, cpu);
+       if (vcpu->cpu == cpu || !is_hkid_assigned(to_kvm_tdx(vcpu->kvm)))
+               return;
+       tdx_flush_vp_on_cpu(vcpu);
+       KVM_BUG_ON(cpu != raw_smp_processor_id(), vcpu->kvm);
+       local_irq_disable();
+       /*
+        * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
+        * vcpu->cpu is read before tdx->cpu_list.
+        */
+       smp_rmb();
+       list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
+       local_irq_enable();
+ }
+ bool tdx_interrupt_allowed(struct kvm_vcpu *vcpu)
+ {
+       /*
+        * KVM can't get the interrupt status of a TDX guest, so it assumes
+        * interrupts are always allowed unless the TDX guest calls TDVMCALL
+        * with HLT, which passes the interrupt-blocked flag.
+        */
+       return vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+              !to_tdx(vcpu)->vp_enter_args.r12;
+ }
+ bool tdx_protected_apic_has_interrupt(struct kvm_vcpu *vcpu)
+ {
+       u64 vcpu_state_details;
+       if (pi_has_pending_interrupt(vcpu))
+               return true;
+       /*
+        * Only check RVI pending for HALTED case with IRQ enabled.
+        * For non-HLT cases, KVM doesn't care about STI/SS shadows.  And if the
+        * interrupt was pending before TD exit, then it _must_ be blocked,
+        * otherwise the interrupt would have been serviced at the instruction
+        * boundary.
+        */
+       if (vmx_get_exit_reason(vcpu).basic != EXIT_REASON_HLT ||
+           to_tdx(vcpu)->vp_enter_args.r12)
+               return false;
+       vcpu_state_details =
+               td_state_non_arch_read64(to_tdx(vcpu), TD_VCPU_STATE_DETAILS_NON_ARCH);
+       return tdx_vcpu_state_details_intr_pending(vcpu_state_details);
+ }
+ /*
+  * Compared to vmx_prepare_switch_to_guest(), there is not much to do
+  * as SEAMCALL/SEAMRET calls take care of most of save and restore.
+  */
+ void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vt *vt = to_vt(vcpu);
+       if (vt->guest_state_loaded)
+               return;
+       if (likely(is_64bit_mm(current->mm)))
+               vt->msr_host_kernel_gs_base = current->thread.gsbase;
+       else
+               vt->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
+       vt->host_debugctlmsr = get_debugctlmsr();
+       vt->guest_state_loaded = true;
+ }
+ struct tdx_uret_msr {
+       u32 msr;
+       unsigned int slot;
+       u64 defval;
+ };
+ static struct tdx_uret_msr tdx_uret_msrs[] = {
+       {.msr = MSR_SYSCALL_MASK, .defval = 0x20200 },
+       {.msr = MSR_STAR,},
+       {.msr = MSR_LSTAR,},
+       {.msr = MSR_TSC_AUX,},
+ };
+ static void tdx_user_return_msr_update_cache(void)
+ {
+       int i;
+       for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++)
+               kvm_user_return_msr_update_cache(tdx_uret_msrs[i].slot,
+                                                tdx_uret_msrs[i].defval);
+ }
+ static void tdx_prepare_switch_to_host(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vt *vt = to_vt(vcpu);
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       if (!vt->guest_state_loaded)
+               return;
+       ++vcpu->stat.host_state_reload;
+       wrmsrl(MSR_KERNEL_GS_BASE, vt->msr_host_kernel_gs_base);
+       if (tdx->guest_entered) {
+               tdx_user_return_msr_update_cache();
+               tdx->guest_entered = false;
+       }
+       vt->guest_state_loaded = false;
+ }
+ void tdx_vcpu_put(struct kvm_vcpu *vcpu)
+ {
+       vmx_vcpu_pi_put(vcpu);
+       tdx_prepare_switch_to_host(vcpu);
+ }
+ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       int i;
+       /*
+        * It is not possible to reclaim pages while hkid is assigned. It might
+        * be assigned if:
+        * 1. the TD VM is being destroyed but freeing hkid failed, in which
+        * case the pages are leaked
+        * 2. TD VCPU creation failed and this is the error path, in which case
+        * there is nothing to do anyway
+        */
+       if (is_hkid_assigned(kvm_tdx))
+               return;
+       if (tdx->vp.tdcx_pages) {
+               for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+                       if (tdx->vp.tdcx_pages[i])
+                               tdx_reclaim_control_page(tdx->vp.tdcx_pages[i]);
+               }
+               kfree(tdx->vp.tdcx_pages);
+               tdx->vp.tdcx_pages = NULL;
+       }
+       if (tdx->vp.tdvpr_page) {
+               tdx_reclaim_control_page(tdx->vp.tdvpr_page);
+               tdx->vp.tdvpr_page = 0;
+       }
+       tdx->state = VCPU_TD_STATE_UNINITIALIZED;
+ }
+ int tdx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+ {
+       if (unlikely(to_tdx(vcpu)->state != VCPU_TD_STATE_INITIALIZED ||
+                    to_kvm_tdx(vcpu->kvm)->state != TD_STATE_RUNNABLE))
+               return -EINVAL;
+       return 1;
+ }
+ static __always_inline u32 tdcall_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+ {
+       switch (tdvmcall_leaf(vcpu)) {
+       case EXIT_REASON_CPUID:
+       case EXIT_REASON_HLT:
+       case EXIT_REASON_IO_INSTRUCTION:
+       case EXIT_REASON_MSR_READ:
+       case EXIT_REASON_MSR_WRITE:
+               return tdvmcall_leaf(vcpu);
+       case EXIT_REASON_EPT_VIOLATION:
+               return EXIT_REASON_EPT_MISCONFIG;
+       default:
+               break;
+       }
+       return EXIT_REASON_TDCALL;
+ }
+ static __always_inline u32 tdx_to_vmx_exit_reason(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       u32 exit_reason;
+       switch (tdx->vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) {
+       case TDX_SUCCESS:
+       case TDX_NON_RECOVERABLE_VCPU:
+       case TDX_NON_RECOVERABLE_TD:
+       case TDX_NON_RECOVERABLE_TD_NON_ACCESSIBLE:
+       case TDX_NON_RECOVERABLE_TD_WRONG_APIC_MODE:
+               break;
+       default:
+               return -1u;
+       }
+       exit_reason = tdx->vp_enter_ret;
+       switch (exit_reason) {
+       case EXIT_REASON_TDCALL:
+               if (tdvmcall_exit_type(vcpu))
+                       return EXIT_REASON_VMCALL;
+               return tdcall_to_vmx_exit_reason(vcpu);
+       case EXIT_REASON_EPT_MISCONFIG:
+               /*
+                * Defer KVM_BUG_ON() until tdx_handle_exit() because this is in
+                * non-instrumentable code with interrupts disabled.
+                */
+               return -1u;
+       default:
+               break;
+       }
+       return exit_reason;
+ }
+ static noinstr void tdx_vcpu_enter_exit(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       struct vcpu_vt *vt = to_vt(vcpu);
+       guest_state_enter_irqoff();
+       tdx->vp_enter_ret = tdh_vp_enter(&tdx->vp, &tdx->vp_enter_args);
+       vt->exit_reason.full = tdx_to_vmx_exit_reason(vcpu);
+       vt->exit_qualification = tdx->vp_enter_args.rcx;
+       tdx->ext_exit_qualification = tdx->vp_enter_args.rdx;
+       tdx->exit_gpa = tdx->vp_enter_args.r8;
+       vt->exit_intr_info = tdx->vp_enter_args.r9;
+       vmx_handle_nmi(vcpu);
+       guest_state_exit_irqoff();
+ }
+ static bool tdx_failed_vmentry(struct kvm_vcpu *vcpu)
+ {
+       return vmx_get_exit_reason(vcpu).failed_vmentry &&
+              vmx_get_exit_reason(vcpu).full != -1u;
+ }
+ static fastpath_t tdx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+ {
+       u64 vp_enter_ret = to_tdx(vcpu)->vp_enter_ret;
+       /*
+        * TDX_OPERAND_BUSY could be returned for SEPT due to 0-step mitigation
+        * or for TD EPOCH due to contention with TDH.MEM.TRACK on TDH.VP.ENTER.
+        *
+        * When KVM requests KVM_REQ_OUTSIDE_GUEST_MODE, which has both
+        * KVM_REQUEST_WAIT and KVM_REQUEST_NO_ACTION set, it requires the target
+        * vCPUs to leave the fastpath so that interrupts can be enabled, ensuring
+        * the IPIs can be delivered.  Return EXIT_FASTPATH_EXIT_HANDLED instead of
+        * EXIT_FASTPATH_REENTER_GUEST to exit fastpath, otherwise, the
+        * requester may be blocked endlessly.
+        */
+       if (unlikely(tdx_operand_busy(vp_enter_ret)))
+               return EXIT_FASTPATH_EXIT_HANDLED;
+       return EXIT_FASTPATH_NONE;
+ }
+ #define TDX_REGS_AVAIL_SET    (BIT_ULL(VCPU_EXREG_EXIT_INFO_1) | \
+                                BIT_ULL(VCPU_EXREG_EXIT_INFO_2) | \
+                                BIT_ULL(VCPU_REGS_RAX) | \
+                                BIT_ULL(VCPU_REGS_RBX) | \
+                                BIT_ULL(VCPU_REGS_RCX) | \
+                                BIT_ULL(VCPU_REGS_RDX) | \
+                                BIT_ULL(VCPU_REGS_RBP) | \
+                                BIT_ULL(VCPU_REGS_RSI) | \
+                                BIT_ULL(VCPU_REGS_RDI) | \
+                                BIT_ULL(VCPU_REGS_R8) | \
+                                BIT_ULL(VCPU_REGS_R9) | \
+                                BIT_ULL(VCPU_REGS_R10) | \
+                                BIT_ULL(VCPU_REGS_R11) | \
+                                BIT_ULL(VCPU_REGS_R12) | \
+                                BIT_ULL(VCPU_REGS_R13) | \
+                                BIT_ULL(VCPU_REGS_R14) | \
+                                BIT_ULL(VCPU_REGS_R15))
+ static void tdx_load_host_xsave_state(struct kvm_vcpu *vcpu)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       /*
+        * All TDX hosts support PKRU; but even if they didn't,
+        * vcpu->arch.host_pkru would be 0 and the wrpkru would be
+        * skipped.
+        */
+       if (vcpu->arch.host_pkru != 0)
+               wrpkru(vcpu->arch.host_pkru);
+       if (kvm_host.xcr0 != (kvm_tdx->xfam & kvm_caps.supported_xcr0))
+               xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
+       /*
+        * Likewise, even if a TDX host didn't support XSS, both arms of
+        * the comparison would be 0 and the wrmsrl would be skipped.
+        */
+       if (kvm_host.xss != (kvm_tdx->xfam & kvm_caps.supported_xss))
+               wrmsrl(MSR_IA32_XSS, kvm_host.xss);
+ }
+ #define TDX_DEBUGCTL_PRESERVED (DEBUGCTLMSR_BTF | \
+                               DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI | \
+                               DEBUGCTLMSR_FREEZE_IN_SMM)
+ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       struct vcpu_vt *vt = to_vt(vcpu);
+       /*
+        * force_immediate_exit requires vCPU entry for event injection, followed
+        * by an immediate exit.  But the TDX module doesn't guarantee entry; it's
+        * already possible for KVM to _think_ it completely entered the guest
+        * without actually having done so.
+        * Since KVM never needs to force an immediate exit for TDX, and can't
+        * do direct injection, just warn on force_immediate_exit.
+        */
+       WARN_ON_ONCE(force_immediate_exit);
+       /*
+        * Wait until retry of SEPT-zap-related SEAMCALL completes before
+        * allowing vCPU entry to avoid contention with tdh_vp_enter() and
+        * TDCALLs.
+        */
+       if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
+               return EXIT_FASTPATH_EXIT_HANDLED;
+       trace_kvm_entry(vcpu, force_immediate_exit);
+       if (pi_test_on(&vt->pi_desc)) {
+               apic->send_IPI_self(POSTED_INTR_VECTOR);
+               if (pi_test_pir(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTT) &
+                              APIC_VECTOR_MASK, &vt->pi_desc))
+                       kvm_wait_lapic_expire(vcpu);
+       }
+       tdx_vcpu_enter_exit(vcpu);
+       if (vt->host_debugctlmsr & ~TDX_DEBUGCTL_PRESERVED)
+               update_debugctlmsr(vt->host_debugctlmsr);
+       tdx_load_host_xsave_state(vcpu);
+       tdx->guest_entered = true;
+       vcpu->arch.regs_avail &= TDX_REGS_AVAIL_SET;
+       if (unlikely(tdx->vp_enter_ret == EXIT_REASON_EPT_MISCONFIG))
+               return EXIT_FASTPATH_NONE;
+       if (unlikely((tdx->vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR))
+               return EXIT_FASTPATH_NONE;
+       if (unlikely(vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MCE_DURING_VMENTRY))
+               kvm_machine_check();
+       trace_kvm_exit(vcpu, KVM_ISA_VMX);
+       if (unlikely(tdx_failed_vmentry(vcpu)))
+               return EXIT_FASTPATH_NONE;
+       return tdx_exit_handlers_fastpath(vcpu);
+ }
+ void tdx_inject_nmi(struct kvm_vcpu *vcpu)
+ {
+       ++vcpu->stat.nmi_injections;
+       td_management_write8(to_tdx(vcpu), TD_VCPU_PEND_NMI, 1);
+       /*
+        * From KVM's perspective, NMI injection is completed right after
+        * writing to PEND_NMI.  KVM doesn't care whether an NMI is injected by
+        * the TDX module or not.
+        */
+       vcpu->arch.nmi_injected = false;
+       /*
+        * TDX doesn't support KVM requesting an NMI-window exit.  If there is
+        * still a pending vNMI, KVM is not able to inject it along with the
+        * one pending in TDX module in a back-to-back way.  Since the previous
+        * vNMI is still pending in TDX module, i.e. it has not been delivered
+        * to TDX guest yet, it's OK to collapse the pending vNMI into the
+        * previous one.  The guest is expected to handle all the NMI sources
+        * when handling the first vNMI.
+        */
+       vcpu->arch.nmi_pending = 0;
+ }
+ static int tdx_handle_exception_nmi(struct kvm_vcpu *vcpu)
+ {
+       u32 intr_info = vmx_get_intr_info(vcpu);
+       /*
+        * Machine checks are handled by handle_exception_irqoff(), or by
+        * tdx_handle_exit() with TDX_NON_RECOVERABLE set if a #MC occurs on
+        * VM-Entry.  NMIs are handled by tdx_vcpu_enter_exit().
+        */
+       if (is_nmi(intr_info) || is_machine_check(intr_info))
+               return 1;
+       vcpu->run->exit_reason = KVM_EXIT_EXCEPTION;
+       vcpu->run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
+       vcpu->run->ex.error_code = 0;
+       return 0;
+ }
+ static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+ {
+       tdvmcall_set_return_code(vcpu, vcpu->run->hypercall.ret);
+       return 1;
+ }
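+ /*
+  * Handle TDG.VP.VMCALL<VMCALL> by moving the TDVMCALL arguments from R10-R14
+  * into the RAX/RBX/RCX/RDX/RSI registers that the common KVM hypercall
+  * emulation expects.
+  */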
+ static int tdx_emulate_vmcall(struct kvm_vcpu *vcpu)
+ {
+       kvm_rax_write(vcpu, to_tdx(vcpu)->vp_enter_args.r10);
+       kvm_rbx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r11);
+       kvm_rcx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r12);
+       kvm_rdx_write(vcpu, to_tdx(vcpu)->vp_enter_args.r13);
+       kvm_rsi_write(vcpu, to_tdx(vcpu)->vp_enter_args.r14);
+       return __kvm_emulate_hypercall(vcpu, 0, complete_hypercall_exit);
+ }
+ /*
+  * Split the request into chunks and check for pending interrupts between
+  * chunks.  This allows timely injection of interrupts and avoids issues with guest lockup
+  * detection.
+  */
+ #define TDX_MAP_GPA_MAX_LEN (2 * 1024 * 1024)
+ static void __tdx_map_gpa(struct vcpu_tdx *tdx);
+ static int tdx_complete_vmcall_map_gpa(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       if (vcpu->run->hypercall.ret) {
+               tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+               tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+               return 1;
+       }
+       tdx->map_gpa_next += TDX_MAP_GPA_MAX_LEN;
+       if (tdx->map_gpa_next >= tdx->map_gpa_end)
+               return 1;
+       /*
+        * Stop processing the remaining part if there is a pending interrupt
+        * that could be delivered.  Skip checking pending RVI for
+        * TDVMCALL_MAP_GPA, see comments in tdx_protected_apic_has_interrupt().
+        */
+       if (kvm_vcpu_has_events(vcpu)) {
+               tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_RETRY);
+               tdx->vp_enter_args.r11 = tdx->map_gpa_next;
+               return 1;
+       }
+       __tdx_map_gpa(tdx);
+       return 0;
+ }
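+ /* Forward the next chunk of the MapGPA request to userspace as KVM_HC_MAP_GPA_RANGE. */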
+ static void __tdx_map_gpa(struct vcpu_tdx *tdx)
+ {
+       u64 gpa = tdx->map_gpa_next;
+       u64 size = tdx->map_gpa_end - tdx->map_gpa_next;
+       if (size > TDX_MAP_GPA_MAX_LEN)
+               size = TDX_MAP_GPA_MAX_LEN;
+       tdx->vcpu.run->exit_reason       = KVM_EXIT_HYPERCALL;
+       tdx->vcpu.run->hypercall.nr      = KVM_HC_MAP_GPA_RANGE;
+       /*
+        * In principle this should have been -KVM_ENOSYS, but userspace (QEMU <=9.2)
+        * assumed that vcpu->run->hypercall.ret is never changed by KVM and thus that
+        * it was always zero on KVM_EXIT_HYPERCALL.  Since KVM is now overwriting
+        * vcpu->run->hypercall.ret, ensure that it is zero so as not to break QEMU.
+        */
+       tdx->vcpu.run->hypercall.ret = 0;
+       tdx->vcpu.run->hypercall.args[0] = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+       tdx->vcpu.run->hypercall.args[1] = size / PAGE_SIZE;
+       tdx->vcpu.run->hypercall.args[2] = vt_is_tdx_private_gpa(tdx->vcpu.kvm, gpa) ?
+                                          KVM_MAP_GPA_RANGE_ENCRYPTED :
+                                          KVM_MAP_GPA_RANGE_DECRYPTED;
+       tdx->vcpu.run->hypercall.flags   = KVM_EXIT_HYPERCALL_LONG_MODE;
+       tdx->vcpu.arch.complete_userspace_io = tdx_complete_vmcall_map_gpa;
+ }
+ static int tdx_map_gpa(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       u64 gpa = tdx->vp_enter_args.r12;
+       u64 size = tdx->vp_enter_args.r13;
+       u64 ret;
+       /*
+        * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
+        * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
+        * bit set.  If not, no error code is defined in the GHCI for TDX, so use
+        * TDVMCALL_STATUS_INVALID_OPERAND in this case.
+        */
+       if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
+               ret = TDVMCALL_STATUS_INVALID_OPERAND;
+               goto error;
+       }
+       if (gpa + size <= gpa || !kvm_vcpu_is_legal_gpa(vcpu, gpa) ||
+           !kvm_vcpu_is_legal_gpa(vcpu, gpa + size - 1) ||
+           (vt_is_tdx_private_gpa(vcpu->kvm, gpa) !=
+            vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))) {
+               ret = TDVMCALL_STATUS_INVALID_OPERAND;
+               goto error;
+       }
+       if (!PAGE_ALIGNED(gpa) || !PAGE_ALIGNED(size)) {
+               ret = TDVMCALL_STATUS_ALIGN_ERROR;
+               goto error;
+       }
+       tdx->map_gpa_end = gpa + size;
+       tdx->map_gpa_next = gpa;
+       __tdx_map_gpa(tdx);
+       return 0;
+ error:
+       tdvmcall_set_return_code(vcpu, ret);
+       tdx->vp_enter_args.r11 = gpa;
+       return 1;
+ }
+ static int tdx_report_fatal_error(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       u64 *regs = vcpu->run->system_event.data;
+       u64 *module_regs = &tdx->vp_enter_args.r8;
+       int index = VCPU_REGS_RAX;
+       vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+       vcpu->run->system_event.type = KVM_SYSTEM_EVENT_TDX_FATAL;
+       vcpu->run->system_event.ndata = 16;
+       /* Dump 16 general-purpose registers to userspace in ascending order. */
+       regs[index++] = tdx->vp_enter_ret;
+       regs[index++] = tdx->vp_enter_args.rcx;
+       regs[index++] = tdx->vp_enter_args.rdx;
+       regs[index++] = tdx->vp_enter_args.rbx;
+       regs[index++] = 0;
+       regs[index++] = 0;
+       regs[index++] = tdx->vp_enter_args.rsi;
+       regs[index] = tdx->vp_enter_args.rdi;
+       for (index = 0; index < 8; index++)
+               regs[VCPU_REGS_R8 + index] = module_regs[index];
+       return 0;
+ }
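+ /*
+  * Handle the CPUID TDVMCALL: look up the leaf/sub-leaf from R12/R13 in KVM's
+  * CPUID tables and return the result in R12-R15.
+  */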
+ static int tdx_emulate_cpuid(struct kvm_vcpu *vcpu)
+ {
+       u32 eax, ebx, ecx, edx;
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       /* EAX and ECX for CPUID are stored in R12 and R13. */
+       eax = tdx->vp_enter_args.r12;
+       ecx = tdx->vp_enter_args.r13;
+       kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false);
+       tdx->vp_enter_args.r12 = eax;
+       tdx->vp_enter_args.r13 = ebx;
+       tdx->vp_enter_args.r14 = ecx;
+       tdx->vp_enter_args.r15 = edx;
+       return 1;
+ }
+ static int tdx_complete_pio_out(struct kvm_vcpu *vcpu)
+ {
+       vcpu->arch.pio.count = 0;
+       return 1;
+ }
+ static int tdx_complete_pio_in(struct kvm_vcpu *vcpu)
+ {
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+       unsigned long val = 0;
+       int ret;
+       ret = ctxt->ops->pio_in_emulated(ctxt, vcpu->arch.pio.size,
+                                        vcpu->arch.pio.port, &val, 1);
+       WARN_ON_ONCE(!ret);
+       tdvmcall_set_return_val(vcpu, val);
+       return 1;
+ }
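+ /*
+  * Handle the I/O-instruction TDVMCALL: R12 holds the access size, R13 the
+  * direction, R14 the port and R15 the data for writes.  Exit to userspace
+  * when in-kernel emulation cannot complete the access.
+  */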
+ static int tdx_emulate_io(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+       unsigned long val = 0;
+       unsigned int port;
+       u64 size, write;
+       int ret;
+       ++vcpu->stat.io_exits;
+       size = tdx->vp_enter_args.r12;
+       write = tdx->vp_enter_args.r13;
+       port = tdx->vp_enter_args.r14;
+       if ((write != 0 && write != 1) || (size != 1 && size != 2 && size != 4)) {
+               tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+               return 1;
+       }
+       if (write) {
+               val = tdx->vp_enter_args.r15;
+               ret = ctxt->ops->pio_out_emulated(ctxt, size, port, &val, 1);
+       } else {
+               ret = ctxt->ops->pio_in_emulated(ctxt, size, port, &val, 1);
+       }
+       if (!ret)
+               vcpu->arch.complete_userspace_io = write ? tdx_complete_pio_out :
+                                                          tdx_complete_pio_in;
+       else if (!write)
+               tdvmcall_set_return_val(vcpu, val);
+       return ret;
+ }
+ static int tdx_complete_mmio_read(struct kvm_vcpu *vcpu)
+ {
+       unsigned long val = 0;
+       gpa_t gpa;
+       int size;
+       gpa = vcpu->mmio_fragments[0].gpa;
+       size = vcpu->mmio_fragments[0].len;
+       memcpy(&val, vcpu->run->mmio.data, size);
+       tdvmcall_set_return_val(vcpu, val);
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+       return 1;
+ }
+ static inline int tdx_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, int size,
+                                unsigned long val)
+ {
+       if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+               trace_kvm_fast_mmio(gpa);
+               return 0;
+       }
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, size, gpa, &val);
+       if (kvm_io_bus_write(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+               return -EOPNOTSUPP;
+       return 0;
+ }
+ static inline int tdx_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, int size)
+ {
+       unsigned long val;
+       if (kvm_io_bus_read(vcpu, KVM_MMIO_BUS, gpa, size, &val))
+               return -EOPNOTSUPP;
+       tdvmcall_set_return_val(vcpu, val);
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ, size, gpa, &val);
+       return 0;
+ }
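+ /*
+  * Handle the MMIO TDVMCALL for shared GPAs: try the in-kernel MMIO buses
+  * first and fall back to a KVM_EXIT_MMIO exit to userspace.
+  */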
+ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       int size, write, r;
+       unsigned long val;
+       gpa_t gpa;
+       size = tdx->vp_enter_args.r12;
+       write = tdx->vp_enter_args.r13;
+       gpa = tdx->vp_enter_args.r14;
+       val = write ? tdx->vp_enter_args.r15 : 0;
+       if (size != 1 && size != 2 && size != 4 && size != 8)
+               goto error;
+       if (write != 0 && write != 1)
+               goto error;
+       /*
+        * TDG.VP.VMCALL<MMIO> allows only shared GPAs; it makes no sense to
+        * do MMIO emulation for a private GPA.
+        */
+       if (vt_is_tdx_private_gpa(vcpu->kvm, gpa) ||
+           vt_is_tdx_private_gpa(vcpu->kvm, gpa + size - 1))
+               goto error;
+       gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+       if (write)
+               r = tdx_mmio_write(vcpu, gpa, size, val);
+       else
+               r = tdx_mmio_read(vcpu, gpa, size);
+       if (!r)
+               /* Kernel completed device emulation. */
+               return 1;
+       /* Request the device emulation to userspace device model. */
+       vcpu->mmio_is_write = write;
+       if (!write)
+               vcpu->arch.complete_userspace_io = tdx_complete_mmio_read;
+       vcpu->run->mmio.phys_addr = gpa;
+       vcpu->run->mmio.len = size;
+       vcpu->run->mmio.is_write = write;
+       vcpu->run->exit_reason = KVM_EXIT_MMIO;
+       if (write) {
+               memcpy(vcpu->run->mmio.data, &val, size);
+       } else {
+               vcpu->mmio_fragments[0].gpa = gpa;
+               vcpu->mmio_fragments[0].len = size;
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, size, gpa, NULL);
+       }
+       return 0;
+ error:
+       tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+       return 1;
+ }
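+ /*
+  * GetTdVmCallInfo: report that no optional TDVMCALL functionality is
+  * supported by returning zero in R11, R13 and R14.
+  */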
+ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       if (tdx->vp_enter_args.r12)
+               tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+       else {
+               tdx->vp_enter_args.r11 = 0;
+               tdx->vp_enter_args.r13 = 0;
+               tdx->vp_enter_args.r14 = 0;
+       }
+       return 1;
+ }
+ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
+ {
+       switch (tdvmcall_leaf(vcpu)) {
+       case TDVMCALL_MAP_GPA:
+               return tdx_map_gpa(vcpu);
+       case TDVMCALL_REPORT_FATAL_ERROR:
+               return tdx_report_fatal_error(vcpu);
+       case TDVMCALL_GET_TD_VM_CALL_INFO:
+               return tdx_get_td_vm_call_info(vcpu);
+       default:
+               break;
+       }
+       tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+       return 1;
+ }
+ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int pgd_level)
+ {
+       u64 shared_bit = (pgd_level == 5) ? TDX_SHARED_BIT_PWL_5 :
+                         TDX_SHARED_BIT_PWL_4;
+       if (KVM_BUG_ON(shared_bit != kvm_gfn_direct_bits(vcpu->kvm), vcpu->kvm))
+               return;
+       td_vmcs_write64(to_tdx(vcpu), SHARED_EPT_POINTER, root_hpa);
+ }
+ static void tdx_unpin(struct kvm *kvm, struct page *page)
+ {
+       put_page(page);
+ }
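+ /* Map a guest page into the private EPT with TDH.MEM.PAGE.AUG once the TD is runnable. */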
+ static int tdx_mem_page_aug(struct kvm *kvm, gfn_t gfn,
+                           enum pg_level level, struct page *page)
+ {
+       int tdx_level = pg_level_to_tdx_sept_level(level);
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       gpa_t gpa = gfn_to_gpa(gfn);
+       u64 entry, level_state;
+       u64 err;
+       err = tdh_mem_page_aug(&kvm_tdx->td, gpa, tdx_level, page, &entry, &level_state);
+       if (unlikely(tdx_operand_busy(err))) {
+               tdx_unpin(kvm, page);
+               return -EBUSY;
+       }
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error_2(TDH_MEM_PAGE_AUG, err, entry, level_state);
+               tdx_unpin(kvm, page);
+               return -EIO;
+       }
+       return 0;
+ }
+ /*
+  * KVM_TDX_INIT_MEM_REGION calls kvm_gmem_populate() to map guest pages; the
+  * callback tdx_gmem_post_populate() then maps pages into private memory
+  * through the SEAMCALL TDH.MEM.PAGE.ADD().  The SEAMCALL also requires the
+  * private EPT structures for the page to have been built beforehand, which is
+  * done via kvm_tdp_map_page(). nr_premapped counts the number of pages that
+  * were added to the EPT structures but not added with TDH.MEM.PAGE.ADD().
+  * The counter has to be zero on KVM_TDX_FINALIZE_VM, to ensure that there
+  * are no half-initialized private EPT mappings.
+  */
+ static int tdx_mem_page_record_premap_cnt(struct kvm *kvm, gfn_t gfn,
+                                         enum pg_level level, kvm_pfn_t pfn)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       if (KVM_BUG_ON(kvm->arch.pre_fault_allowed, kvm))
+               return -EINVAL;
+       /* nr_premapped will be decreased when tdh_mem_page_add() is called. */
+       atomic64_inc(&kvm_tdx->nr_premapped);
+       return 0;
+ }
+ int tdx_sept_set_private_spte(struct kvm *kvm, gfn_t gfn,
+                             enum pg_level level, kvm_pfn_t pfn)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       struct page *page = pfn_to_page(pfn);
+       /* TODO: handle large pages. */
+       if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+               return -EINVAL;
+       /*
+        * Because guest_memfd doesn't support page migration with
+        * a_ops->migrate_folio (yet), no callback is triggered for KVM on page
+        * migration.  Until guest_memfd supports page migration, prevent page
+        * migration.
+        * TODO: Once guest_memfd introduces callback on page migration,
+        * implement it and remove get_page/put_page().
+        */
+       get_page(page);
+       /*
+        * Read 'pre_fault_allowed' before 'kvm_tdx->state'; see matching
+        * barrier in tdx_td_finalize().
+        */
+       smp_rmb();
+       if (likely(kvm_tdx->state == TD_STATE_RUNNABLE))
+               return tdx_mem_page_aug(kvm, gfn, level, page);
+       return tdx_mem_page_record_premap_cnt(kvm, gfn, level, pfn);
+ }
+ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
+                                     enum pg_level level, struct page *page)
+ {
+       int tdx_level = pg_level_to_tdx_sept_level(level);
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       gpa_t gpa = gfn_to_gpa(gfn);
+       u64 err, entry, level_state;
+       /* TODO: handle large pages. */
+       if (KVM_BUG_ON(level != PG_LEVEL_4K, kvm))
+               return -EINVAL;
+       if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
+               return -EINVAL;
+       /*
+        * When zapping a private page, the mmu write lock is held, so there is
+        * no race with SEPT operations from other vCPUs.  The SEAMCALL can still
+        * race with TDH.VP.ENTER (due to 0-step mitigation) and guest TDCALLs.
+        */
+       err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+                                 &level_state);
+       if (unlikely(tdx_operand_busy(err))) {
+               /*
+                * The second retry is expected to succeed after kicking off all
+                * other vCPUs and prevent them from invoking TDH.VP.ENTER.
+                */
+               tdx_no_vcpus_enter_start(kvm);
+               err = tdh_mem_page_remove(&kvm_tdx->td, gpa, tdx_level, &entry,
+                                         &level_state);
+               tdx_no_vcpus_enter_stop(kvm);
+       }
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error_2(TDH_MEM_PAGE_REMOVE, err, entry, level_state);
+               return -EIO;
+       }
+       err = tdh_phymem_page_wbinvd_hkid((u16)kvm_tdx->hkid, page);
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+               return -EIO;
+       }
+       tdx_clear_page(page);
+       tdx_unpin(kvm, page);
+       return 0;
+ }
+ int tdx_sept_link_private_spt(struct kvm *kvm, gfn_t gfn,
+                             enum pg_level level, void *private_spt)
+ {
+       int tdx_level = pg_level_to_tdx_sept_level(level);
+       gpa_t gpa = gfn_to_gpa(gfn);
+       struct page *page = virt_to_page(private_spt);
+       u64 err, entry, level_state;
+       err = tdh_mem_sept_add(&to_kvm_tdx(kvm)->td, gpa, tdx_level, page, &entry,
+                              &level_state);
+       if (unlikely(tdx_operand_busy(err)))
+               return -EBUSY;
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error_2(TDH_MEM_SEPT_ADD, err, entry, level_state);
+               return -EIO;
+       }
+       return 0;
+ }
+ /*
+  * Check whether the error returned from a SEPT-zap SEAMCALL is due to a page
+  * that was mapped by KVM_TDX_INIT_MEM_REGION without tdh_mem_page_add() having
+  * been called successfully.
+  *
+  * Since tdh_mem_sept_add() must have been invoked successfully before a
+  * non-leaf entry is present in the mirrored page table, the SEPT-zap related
+  * SEAMCALLs should not encounter TDX_EPT_WALK_FAILED. They should instead
+  * find TDX_EPT_ENTRY_STATE_INCORRECT due to an empty leaf entry found in the
+  * SEPT.
+  *
+  * Further check whether the entry returned from the SEPT walk has RWX
+  * permissions, to filter out anything unexpected.
+  *
+  * Note: @level is pg_level, not the tdx_level. The tdx_level extracted from
+  * level_state returned from a SEAMCALL error is the same as that passed into
+  * the SEAMCALL.
+  */
+ static int tdx_is_sept_zap_err_due_to_premap(struct kvm_tdx *kvm_tdx, u64 err,
+                                            u64 entry, int level)
+ {
+       if (!err || kvm_tdx->state == TD_STATE_RUNNABLE)
+               return false;
+       if (err != (TDX_EPT_ENTRY_STATE_INCORRECT | TDX_OPERAND_ID_RCX))
+               return false;
+       if ((is_last_spte(entry, level) && (entry & VMX_EPT_RWX_MASK)))
+               return false;
+       return true;
+ }
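+ /*
+  * Block the private GPA range with TDH.MEM.RANGE.BLOCK.  Returns 1 if the
+  * page still needs to be removed, 0 if it was only pre-mapped and has been
+  * accounted for, and a negative error code on failure.
+  */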
+ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
+                                    enum pg_level level, struct page *page)
+ {
+       int tdx_level = pg_level_to_tdx_sept_level(level);
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       gpa_t gpa = gfn_to_gpa(gfn) & KVM_HPAGE_MASK(level);
+       u64 err, entry, level_state;
+       /* Large pages are not supported yet. */
+       WARN_ON_ONCE(level != PG_LEVEL_4K);
+       err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+       if (unlikely(tdx_operand_busy(err))) {
+               /* After no vCPUs enter, the second retry is expected to succeed */
+               tdx_no_vcpus_enter_start(kvm);
+               err = tdh_mem_range_block(&kvm_tdx->td, gpa, tdx_level, &entry, &level_state);
+               tdx_no_vcpus_enter_stop(kvm);
+       }
+       if (tdx_is_sept_zap_err_due_to_premap(kvm_tdx, err, entry, level) &&
+           !KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm)) {
+               atomic64_dec(&kvm_tdx->nr_premapped);
+               tdx_unpin(kvm, page);
+               return 0;
+       }
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
+               return -EIO;
+       }
+       return 1;
+ }
+ /*
+  * Ensure that shared and private EPTs are flushed on all vCPUs.
+  * tdh_mem_track() is the only caller that increases the TD epoch. An increase
+  * in the TD epoch (e.g., to value "N + 1") is successful only if no vCPUs are
+  * running in guest mode with the value "N - 1".
+  *
+  * A successful execution of tdh_mem_track() ensures that vCPUs can only run
+  * in guest mode with TD epoch value "N" if no TD exit occurs after the TD
+  * epoch has been increased to "N + 1".
+  *
+  * Kicking off all vCPUs after that further ensures that no vCPU can run in
+  * guest mode with TD epoch value "N", which unblocks the next tdh_mem_track()
+  * (e.g. to increase the TD epoch to "N + 2").
+  *
+  * The TDX module will flush the EPT on the next TD enter and make vCPUs run
+  * in guest mode with TD epoch value "N + 1".
+  *
+  * kvm_make_all_cpus_request() guarantees all vCPUs are out of guest mode by
+  * waiting for the empty IPI handler ack_kick().
+  *
+  * No action is required for the vCPUs being kicked off, since the kick-off
+  * certainly occurs after the TD epoch increment and before the next
+  * tdh_mem_track().
+  */
+ static void tdx_track(struct kvm *kvm)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       u64 err;
+       /* If the TD isn't finalized, no vCPU has run yet. */
+       if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE))
+               return;
+       lockdep_assert_held_write(&kvm->mmu_lock);
+       err = tdh_mem_track(&kvm_tdx->td);
+       if (unlikely(tdx_operand_busy(err))) {
+               /* After no vCPUs enter, the second retry is expected to succeed */
+               tdx_no_vcpus_enter_start(kvm);
+               err = tdh_mem_track(&kvm_tdx->td);
+               tdx_no_vcpus_enter_stop(kvm);
+       }
+       if (KVM_BUG_ON(err, kvm))
+               pr_tdx_error(TDH_MEM_TRACK, err);
+       kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
+ }
+ int tdx_sept_free_private_spt(struct kvm *kvm, gfn_t gfn,
+                             enum pg_level level, void *private_spt)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       /*
+        * free_external_spt() is only called after the hkid has been freed,
+        * when the TD is being torn down.
+        * KVM doesn't (yet) zap page table pages in the mirror page table while
+        * the TD is active, though guest pages mapped in the mirror page table
+        * could be zapped while the TD is active, e.g. for shared <-> private
+        * conversion and slot move/deletion.
+        */
+       if (KVM_BUG_ON(is_hkid_assigned(kvm_tdx), kvm))
+               return -EINVAL;
+       /*
+        * The HKID assigned to this TD was already freed and cache was
+        * already flushed. We don't have to flush again.
+        */
+       return tdx_reclaim_page(virt_to_page(private_spt));
+ }
+ int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
+                                enum pg_level level, kvm_pfn_t pfn)
+ {
+       struct page *page = pfn_to_page(pfn);
+       int ret;
+       /*
+        * HKID is released after all private pages have been removed, and set
+        * before any might be populated. Warn if zapping is attempted when
+        * there can't be anything populated in the private EPT.
+        */
+       if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
+               return -EINVAL;
+       ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
+       if (ret <= 0)
+               return ret;
+       /*
+        * TDX requires TLB tracking before dropping a private page.  Do
+        * it here, although it is also done later.
+        */
+       tdx_track(kvm);
+       return tdx_sept_drop_private_spte(kvm, gfn, level, page);
+ }
+ void tdx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode,
+                          int trig_mode, int vector)
+ {
+       struct kvm_vcpu *vcpu = apic->vcpu;
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       /* TDX supports only posted interrupts; there is no local APIC emulation. */
+       __vmx_deliver_posted_interrupt(vcpu, &tdx->vt.pi_desc, vector);
+       trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector);
+ }
+ static inline bool tdx_is_sept_violation_unexpected_pending(struct kvm_vcpu *vcpu)
+ {
+       u64 eeq_type = to_tdx(vcpu)->ext_exit_qualification & TDX_EXT_EXIT_QUAL_TYPE_MASK;
+       u64 eq = vmx_get_exit_qual(vcpu);
+       if (eeq_type != TDX_EXT_EXIT_QUAL_TYPE_PENDING_EPT_VIOLATION)
+               return false;
++      return !(eq & EPT_VIOLATION_PROT_MASK) && !(eq & EPT_VIOLATION_EXEC_FOR_RING3_LIN);
+ }
+ static int tdx_handle_ept_violation(struct kvm_vcpu *vcpu)
+ {
+       unsigned long exit_qual;
+       gpa_t gpa = to_tdx(vcpu)->exit_gpa;
+       bool local_retry = false;
+       int ret;
+       if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+               if (tdx_is_sept_violation_unexpected_pending(vcpu)) {
+                       pr_warn("Guest access before accepting 0x%llx on vCPU %d\n",
+                               gpa, vcpu->vcpu_id);
+                       kvm_vm_dead(vcpu->kvm);
+                       return -EIO;
+               }
+               /*
+                * Always treat SEPT violations as write faults.  Ignore the
+                * EXIT_QUALIFICATION reported by TDX-SEAM for SEPT violations.
+                * TD private pages are always RWX in the SEPT tables,
+                * i.e. they're always mapped writable.  Just as importantly,
+                * treating SEPT violations as write faults is necessary to
+                * avoid COW allocations, which will cause TDAUGPAGE failures
+                * due to aliasing a single HPA to multiple GPAs.
+                */
+               exit_qual = EPT_VIOLATION_ACC_WRITE;
+               /* Only private GPA triggers zero-step mitigation */
+               local_retry = true;
+       } else {
+               exit_qual = vmx_get_exit_qual(vcpu);
+               /*
+                * EPT violation due to instruction fetch should never be
+                * triggered from shared memory in TDX guest.  If such EPT
+                * violation occurs, treat it as broken hardware.
+                */
+               if (KVM_BUG_ON(exit_qual & EPT_VIOLATION_ACC_INSTR, vcpu->kvm))
+                       return -EIO;
+       }
+       trace_kvm_page_fault(vcpu, gpa, exit_qual);
+       /*
+        * To minimize TDH.VP.ENTER invocations, retry locally for private GPA
+        * mapping in TDX.
+        *
+        * KVM may return RET_PF_RETRY for private GPA due to
+        * - contentions when atomically updating SPTEs of the mirror page table
+        * - in-progress GFN invalidation or memslot removal.
+        * - TDX_OPERAND_BUSY error from TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD,
+        *   caused by contentions with TDH.VP.ENTER (with zero-step mitigation)
+        *   or certain TDCALLs.
+        *
+        * If TDH.VP.ENTER is invoked more times than the threshold set by the
+        * TDX module before KVM resolves the private GPA mapping, the TDX
+        * module will activate zero-step mitigation during TDH.VP.ENTER. This
+        * process acquires an SEPT tree lock in the TDX module, leading to
+        * further contentions with TDH.MEM.PAGE.AUG or TDH.MEM.SEPT.ADD
+        * operations on other vCPUs.
+        *
+        * Breaking out of local retries for kvm_vcpu_has_events() is for
+        * interrupt injection. kvm_vcpu_has_events() should not see pending
+        * events for TDX. Since KVM can't determine if IRQs (or NMIs) are
+        * blocked by TDs, false positives are inevitable i.e., KVM may re-enter
+        * the guest even if the IRQ/NMI can't be delivered.
+        *
+        * Note: even without breaking out of local retries, zero-step
+        * mitigation may still occur due to
+        * - invoking of TDH.VP.ENTER after KVM_EXIT_MEMORY_FAULT,
+        * - a single RIP causing EPT violations for more GFNs than the
+        *   threshold count.
+        * This is safe, as triggering zero-step mitigation only introduces
+        * contentions to page installation SEAMCALLs on other vCPUs, which will
+        * handle retries locally in their EPT violation handlers.
+        */
+       while (1) {
+               ret = __vmx_handle_ept_violation(vcpu, gpa, exit_qual);
+               if (ret != RET_PF_RETRY || !local_retry)
+                       break;
+               if (kvm_vcpu_has_events(vcpu) || signal_pending(current))
+                       break;
+               if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
+                       ret = -EIO;
+                       break;
+               }
+               cond_resched();
+       }
+       return ret;
+ }
+ int tdx_complete_emulated_msr(struct kvm_vcpu *vcpu, int err)
+ {
+       if (err) {
+               tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+               return 1;
+       }
+       if (vmx_get_exit_reason(vcpu).basic == EXIT_REASON_MSR_READ)
+               tdvmcall_set_return_val(vcpu, kvm_read_edx_eax(vcpu));
+       return 1;
+ }
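+ /*
+  * Dispatch a TD exit that wasn't handled in the fastpath: map the exit reason
+  * reported by TDH.VP.ENTER to the corresponding in-kernel handler or a
+  * userspace exit.
+  */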
+ int tdx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t fastpath)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       u64 vp_enter_ret = tdx->vp_enter_ret;
+       union vmx_exit_reason exit_reason = vmx_get_exit_reason(vcpu);
+       if (fastpath != EXIT_FASTPATH_NONE)
+               return 1;
+       if (unlikely(vp_enter_ret == EXIT_REASON_EPT_MISCONFIG)) {
+               KVM_BUG_ON(1, vcpu->kvm);
+               return -EIO;
+       }
+       /*
+        * Handle TDX SW errors, including TDX_SEAMCALL_UD, TDX_SEAMCALL_GP and
+        * TDX_SEAMCALL_VMFAILINVALID.
+        */
+       if (unlikely((vp_enter_ret & TDX_SW_ERROR) == TDX_SW_ERROR)) {
+               KVM_BUG_ON(!kvm_rebooting, vcpu->kvm);
+               goto unhandled_exit;
+       }
+       if (unlikely(tdx_failed_vmentry(vcpu))) {
+               /*
+                * If the guest state is protected, that means off-TD debug is
+                * not enabled, so TDX_NON_RECOVERABLE must be set.
+                */
+               WARN_ON_ONCE(vcpu->arch.guest_state_protected &&
+                               !(vp_enter_ret & TDX_NON_RECOVERABLE));
+               vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+               vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason.full;
+               vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
+               return 0;
+       }
+       if (unlikely(vp_enter_ret & (TDX_ERROR | TDX_NON_RECOVERABLE)) &&
+               exit_reason.basic != EXIT_REASON_TRIPLE_FAULT) {
+               kvm_pr_unimpl("TD vp_enter_ret 0x%llx\n", vp_enter_ret);
+               goto unhandled_exit;
+       }
+       WARN_ON_ONCE(exit_reason.basic != EXIT_REASON_TRIPLE_FAULT &&
+                    (vp_enter_ret & TDX_SEAMCALL_STATUS_MASK) != TDX_SUCCESS);
+       switch (exit_reason.basic) {
+       case EXIT_REASON_TRIPLE_FAULT:
+               vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+               vcpu->mmio_needed = 0;
+               return 0;
+       case EXIT_REASON_EXCEPTION_NMI:
+               return tdx_handle_exception_nmi(vcpu);
+       case EXIT_REASON_EXTERNAL_INTERRUPT:
+               ++vcpu->stat.irq_exits;
+               return 1;
+       case EXIT_REASON_CPUID:
+               return tdx_emulate_cpuid(vcpu);
+       case EXIT_REASON_HLT:
+               return kvm_emulate_halt_noskip(vcpu);
+       case EXIT_REASON_TDCALL:
+               return handle_tdvmcall(vcpu);
+       case EXIT_REASON_VMCALL:
+               return tdx_emulate_vmcall(vcpu);
+       case EXIT_REASON_IO_INSTRUCTION:
+               return tdx_emulate_io(vcpu);
+       case EXIT_REASON_MSR_READ:
+               kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+               return kvm_emulate_rdmsr(vcpu);
+       case EXIT_REASON_MSR_WRITE:
+               kvm_rcx_write(vcpu, tdx->vp_enter_args.r12);
+               kvm_rax_write(vcpu, tdx->vp_enter_args.r13 & -1u);
+               kvm_rdx_write(vcpu, tdx->vp_enter_args.r13 >> 32);
+               return kvm_emulate_wrmsr(vcpu);
+       case EXIT_REASON_EPT_MISCONFIG:
+               return tdx_emulate_mmio(vcpu);
+       case EXIT_REASON_EPT_VIOLATION:
+               return tdx_handle_ept_violation(vcpu);
+       case EXIT_REASON_OTHER_SMI:
+               /*
+                * Unlike VMX, SMI in SEAM non-root mode (i.e. when
+                * TD guest vCPU is running) will cause VM exit to TDX module,
+                * then SEAMRET to KVM.  Once it exits to KVM, SMI is delivered
+                * and handled by kernel handler right away.
+                *
+                * The Other SMI exit can also be caused by the SEAM non-root
+                * machine check delivered via Machine Check System Management
+                * Interrupt (MSMI), but it has already been handled by the
+                * kernel machine check handler, i.e., the memory page has been
+                * marked as poisoned and it won't be freed to the free list
+                * when the TDX guest is terminated (the TDX module marks the
+                * guest as dead and prevents it from running further when a
+                * machine check happens in SEAM non-root).
+                *
+                * - An MSMI will not reach here; it's handled as the
+                *   non_recoverable case above.
+                * - If it's not an MSMI, no need to do anything here.
+                */
+               return 1;
+       default:
+               break;
+       }
+ unhandled_exit:
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+       vcpu->run->internal.ndata = 2;
+       vcpu->run->internal.data[0] = vp_enter_ret;
+       vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+       return 0;
+ }
+ void tdx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason,
+               u64 *info1, u64 *info2, u32 *intr_info, u32 *error_code)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       *reason = tdx->vt.exit_reason.full;
+       if (*reason != -1u) {
+               *info1 = vmx_get_exit_qual(vcpu);
+               *info2 = tdx->ext_exit_qualification;
+               *intr_info = vmx_get_intr_info(vcpu);
+       } else {
+               *info1 = 0;
+               *info2 = 0;
+               *intr_info = 0;
+       }
+       *error_code = 0;
+ }
+ bool tdx_has_emulated_msr(u32 index)
+ {
+       switch (index) {
+       case MSR_IA32_UCODE_REV:
+       case MSR_IA32_ARCH_CAPABILITIES:
+       case MSR_IA32_POWER_CTL:
+       case MSR_IA32_CR_PAT:
+       case MSR_MTRRcap:
+       case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000:
+       case MSR_MTRRdefType:
+       case MSR_IA32_TSC_DEADLINE:
+       case MSR_IA32_MISC_ENABLE:
+       case MSR_PLATFORM_INFO:
+       case MSR_MISC_FEATURES_ENABLES:
+       case MSR_IA32_APICBASE:
+       case MSR_EFER:
+       case MSR_IA32_FEAT_CTL:
+       case MSR_IA32_MCG_CAP:
+       case MSR_IA32_MCG_STATUS:
+       case MSR_IA32_MCG_CTL:
+       case MSR_IA32_MCG_EXT_CTL:
+       case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
+       case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
+               /* MSR_IA32_MCx_{CTL, STATUS, ADDR, MISC, CTL2} */
+       case MSR_KVM_POLL_CONTROL:
+               return true;
+       case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
+               /*
+                * x2APIC registers that are virtualized by the CPU can't be
+                * emulated; KVM doesn't have access to the virtual APIC page.
+                */
+               switch (index) {
+               case X2APIC_MSR(APIC_TASKPRI):
+               case X2APIC_MSR(APIC_PROCPRI):
+               case X2APIC_MSR(APIC_EOI):
+               case X2APIC_MSR(APIC_ISR) ... X2APIC_MSR(APIC_ISR + APIC_ISR_NR):
+               case X2APIC_MSR(APIC_TMR) ... X2APIC_MSR(APIC_TMR + APIC_ISR_NR):
+               case X2APIC_MSR(APIC_IRR) ... X2APIC_MSR(APIC_IRR + APIC_ISR_NR):
+                       return false;
+               default:
+                       return true;
+               }
+       default:
+               return false;
+       }
+ }
+ static bool tdx_is_read_only_msr(u32 index)
+ {
+       return  index == MSR_IA32_APICBASE || index == MSR_EFER ||
+               index == MSR_IA32_FEAT_CTL;
+ }
+ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+ {
+       switch (msr->index) {
+       case MSR_IA32_FEAT_CTL:
+               /*
+                * MCE and MCA are advertised via CPUID. The guest kernel can
+                * check whether LMCE is enabled.
+                */
+               msr->data = FEAT_CTL_LOCKED;
+               if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+                       msr->data |= FEAT_CTL_LMCE_ENABLED;
+               return 0;
+       case MSR_IA32_MCG_EXT_CTL:
+               if (!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P))
+                       return 1;
+               msr->data = vcpu->arch.mcg_ext_ctl;
+               return 0;
+       default:
+               if (!tdx_has_emulated_msr(msr->index))
+                       return 1;
+               return kvm_get_msr_common(vcpu, msr);
+       }
+ }
+ int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+ {
+       switch (msr->index) {
+       case MSR_IA32_MCG_EXT_CTL:
+               if ((!msr->host_initiated && !(vcpu->arch.mcg_cap & MCG_LMCE_P)) ||
+                   (msr->data & ~MCG_EXT_CTL_LMCE_EN))
+                       return 1;
+               vcpu->arch.mcg_ext_ctl = msr->data;
+               return 0;
+       default:
+               if (tdx_is_read_only_msr(msr->index))
+                       return 1;
+               if (!tdx_has_emulated_msr(msr->index))
+                       return 1;
+               return kvm_set_msr_common(vcpu, msr);
+       }
+ }
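+ /*
+  * KVM_TDX_CAPABILITIES: report the TDX module's capabilities, including its
+  * supported CPUID configuration, to userspace.
+  */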
+ static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
+ {
+       const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+       struct kvm_tdx_capabilities __user *user_caps;
+       struct kvm_tdx_capabilities *caps = NULL;
+       int ret = 0;
+       /* flags is reserved for future use */
+       if (cmd->flags)
+               return -EINVAL;
+       caps = kmalloc(sizeof(*caps) +
+                      sizeof(struct kvm_cpuid_entry2) * td_conf->num_cpuid_config,
+                      GFP_KERNEL);
+       if (!caps)
+               return -ENOMEM;
+       user_caps = u64_to_user_ptr(cmd->data);
+       if (copy_from_user(caps, user_caps, sizeof(*caps))) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (caps->cpuid.nent < td_conf->num_cpuid_config) {
+               ret = -E2BIG;
+               goto out;
+       }
+       ret = init_kvm_tdx_caps(td_conf, caps);
+       if (ret)
+               goto out;
+       if (copy_to_user(user_caps, caps, sizeof(*caps))) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (copy_to_user(user_caps->cpuid.entries, caps->cpuid.entries,
+                        caps->cpuid.nent *
+                        sizeof(caps->cpuid.entries[0])))
+               ret = -EFAULT;
+ out:
+       /* kfree() accepts NULL. */
+       kfree(caps);
+       return ret;
+ }
+ /*
+  * KVM reports guest physical address bits in CPUID.0x80000008.EAX[23:16], which is
+  * similar to TDX's GPAW. Use this field as the interface for userspace to
+  * configure the GPAW and EPT level for TDs.
+  *
+  * Only values 48 and 52 are supported. Value 52 means GPAW-52 and EPT level
+  * 5; value 48 means GPAW-48 and EPT level 4. For value 48, GPAW-48 is always
+  * supported. Value 52 is only supported when the platform supports 5 level
+  * EPT.
+  */
+ static int setup_tdparams_eptp_controls(struct kvm_cpuid2 *cpuid,
+                                       struct td_params *td_params)
+ {
+       const struct kvm_cpuid_entry2 *entry;
+       int guest_pa;
+       entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent, 0x80000008, 0);
+       if (!entry)
+               return -EINVAL;
+       guest_pa = tdx_get_guest_phys_addr_bits(entry->eax);
+       if (guest_pa != 48 && guest_pa != 52)
+               return -EINVAL;
+       if (guest_pa == 52 && !cpu_has_vmx_ept_5levels())
+               return -EINVAL;
+       td_params->eptp_controls = VMX_EPTP_MT_WB;
+       if (guest_pa == 52) {
+               td_params->eptp_controls |= VMX_EPTP_PWL_5;
+               td_params->config_flags |= TDX_CONFIG_FLAGS_MAX_GPAW;
+       } else {
+               td_params->eptp_controls |= VMX_EPTP_PWL_4;
+       }
+       return 0;
+ }
+ static int setup_tdparams_cpuids(struct kvm_cpuid2 *cpuid,
+                                struct td_params *td_params)
+ {
+       const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+       const struct kvm_cpuid_entry2 *entry;
+       struct tdx_cpuid_value *value;
+       int i, copy_cnt = 0;
+       /*
+        * td_params.cpuid_values: The number and the order of cpuid_value entries
+        * must be the same as those of struct tdsysinfo.{num_cpuid_config, cpuid_configs}.
+        * It's assumed that td_params was zeroed.
+        */
+       for (i = 0; i < td_conf->num_cpuid_config; i++) {
+               struct kvm_cpuid_entry2 tmp;
+               td_init_cpuid_entry2(&tmp, i);
+               entry = kvm_find_cpuid_entry2(cpuid->entries, cpuid->nent,
+                                             tmp.function, tmp.index);
+               if (!entry)
+                       continue;
+               if (tdx_unsupported_cpuid(entry))
+                       return -EINVAL;
+               copy_cnt++;
+               value = &td_params->cpuid_values[i];
+               value->eax = entry->eax;
+               value->ebx = entry->ebx;
+               value->ecx = entry->ecx;
+               value->edx = entry->edx;
+               /*
+                * TDX module does not accept nonzero bits 16..23 for the
+                * CPUID[0x80000008].EAX, see setup_tdparams_eptp_controls().
+                */
+               if (tmp.function == 0x80000008)
+                       value->eax = tdx_set_guest_phys_addr_bits(value->eax, 0);
+       }
+       /*
+        * Rely on the TDX module to reject invalid configuration, but it can't
+        * check leaves that don't have a proper slot in td_params->cpuid_values
+        * to be copied into. So fail if there were entries that didn't get copied to
+        * td_params.
+        */
+       if (copy_cnt != cpuid->nent)
+               return -EINVAL;
+       return 0;
+ }
+ static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
+                       struct kvm_tdx_init_vm *init_vm)
+ {
+       const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
+       struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
+       int ret;
+       if (kvm->created_vcpus)
+               return -EBUSY;
+       if (init_vm->attributes & ~tdx_get_supported_attrs(td_conf))
+               return -EINVAL;
+       if (init_vm->xfam & ~tdx_get_supported_xfam(td_conf))
+               return -EINVAL;
+       td_params->max_vcpus = kvm->max_vcpus;
+       td_params->attributes = init_vm->attributes | td_conf->attributes_fixed1;
+       td_params->xfam = init_vm->xfam | td_conf->xfam_fixed1;
+       td_params->config_flags = TDX_CONFIG_FLAGS_NO_RBP_MOD;
+       td_params->tsc_frequency = TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
+       ret = setup_tdparams_eptp_controls(cpuid, td_params);
+       if (ret)
+               return ret;
+       ret = setup_tdparams_cpuids(cpuid, td_params);
+       if (ret)
+               return ret;
+ #define MEMCPY_SAME_SIZE(dst, src)                            \
+       do {                                                    \
+               BUILD_BUG_ON(sizeof(dst) != sizeof(src));       \
+               memcpy((dst), (src), sizeof(dst));              \
+       } while (0)
+       MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
+       MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
+       MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
+       return 0;
+ }
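+ /*
+  * Create the TD control structures: allocate the TDR and TDCS pages, program
+  * the host key id on every package, then initialize the TD via
+  * TDH.MNG.CREATE, TDH.MNG.ADDCX and TDH.MNG.INIT.
+  */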
+ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params,
+                        u64 *seamcall_err)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       cpumask_var_t packages;
+       struct page **tdcs_pages = NULL;
+       struct page *tdr_page;
+       int ret, i;
+       u64 err, rcx;
+       *seamcall_err = 0;
+       ret = tdx_guest_keyid_alloc();
+       if (ret < 0)
+               return ret;
+       kvm_tdx->hkid = ret;
+       kvm_tdx->misc_cg = get_current_misc_cg();
+       ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1);
+       if (ret)
+               goto free_hkid;
+       ret = -ENOMEM;
+       atomic_inc(&nr_configured_hkid);
+       tdr_page = alloc_page(GFP_KERNEL);
+       if (!tdr_page)
+               goto free_hkid;
+       kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+       /* TDVPS = TDVPR(4K page) + TDCX(multiple 4K pages), -1 for TDVPR. */
+       kvm_tdx->td.tdcx_nr_pages = tdx_sysinfo->td_ctrl.tdvps_base_size / PAGE_SIZE - 1;
+       tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
+                            GFP_KERNEL | __GFP_ZERO);
+       if (!tdcs_pages)
+               goto free_tdr;
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               tdcs_pages[i] = alloc_page(GFP_KERNEL);
+               if (!tdcs_pages[i])
+                       goto free_tdcs;
+       }
+       if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+               goto free_tdcs;
+       cpus_read_lock();
+       /*
+        * At least one CPU of each package must be online in order to program
+        * the host key id on all packages.  Check it.
+        */
+       for_each_present_cpu(i)
+               cpumask_set_cpu(topology_physical_package_id(i), packages);
+       for_each_online_cpu(i)
+               cpumask_clear_cpu(topology_physical_package_id(i), packages);
+       if (!cpumask_empty(packages)) {
+               ret = -EIO;
+               /*
+                * Because it's hard for a human operator to figure out the
+                * reason, print a warning.
+                */
+ #define MSG_ALLPKG    "All packages need to have online CPU to create TD. Online CPU and retry.\n"
+               pr_warn_ratelimited(MSG_ALLPKG);
+               goto free_packages;
+       }
+       /*
+        * TDH.MNG.CREATE tries to grab the global TDX module lock and fails
+        * with TDX_OPERAND_BUSY when it can't.  Take the global
+        * lock to prevent that failure.
+        */
+       mutex_lock(&tdx_lock);
+       kvm_tdx->td.tdr_page = tdr_page;
+       err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
+       mutex_unlock(&tdx_lock);
+       if (err == TDX_RND_NO_ENTROPY) {
+               ret = -EAGAIN;
+               goto free_packages;
+       }
+       if (WARN_ON_ONCE(err)) {
+               pr_tdx_error(TDH_MNG_CREATE, err);
+               ret = -EIO;
+               goto free_packages;
+       }
+       for_each_online_cpu(i) {
+               int pkg = topology_physical_package_id(i);
+               if (cpumask_test_and_set_cpu(pkg, packages))
+                       continue;
+               /*
+                * Program the memory controller in the package with an
+                * encryption key associated with the TDX private host key id
+                * assigned to this TDR.  Concurrent operations on the same memory
+                * controller result in TDX_OPERAND_BUSY. No locking needed
+                * beyond the cpus_read_lock() above as it serializes against
+                * hotplug and the first online CPU of the package is always
+                * used. We never have two CPUs in the same socket trying to
+                * program the key.
+                */
+               ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+                                     kvm_tdx, true);
+               if (ret)
+                       break;
+       }
+       cpus_read_unlock();
+       free_cpumask_var(packages);
+       if (ret) {
+               i = 0;
+               goto teardown;
+       }
+       kvm_tdx->td.tdcs_pages = tdcs_pages;
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
+               if (err == TDX_RND_NO_ENTROPY) {
+                       /* Here it's hard to allow userspace to retry. */
+                       ret = -EAGAIN;
+                       goto teardown;
+               }
+               if (WARN_ON_ONCE(err)) {
+                       pr_tdx_error(TDH_MNG_ADDCX, err);
+                       ret = -EIO;
+                       goto teardown;
+               }
+       }
+       err = tdh_mng_init(&kvm_tdx->td, __pa(td_params), &rcx);
+       if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_INVALID) {
+               /*
+                * Because the operands come from the user, don't warn.
+                * Return a hint to the user because it's sometimes hard for the
+                * user to figure out which operand is invalid.  The SEAMCALL status
+                * code indicates which operand caused the invalid operand error.
+                */
+               *seamcall_err = err;
+               ret = -EINVAL;
+               goto teardown;
+       } else if (WARN_ON_ONCE(err)) {
+               pr_tdx_error_1(TDH_MNG_INIT, err, rcx);
+               ret = -EIO;
+               goto teardown;
+       }
+       return 0;
+       /*
+        * The sequence for freeing resources from a partially initialized TD
+        * varies based on where in the initialization flow failure occurred.
+        * Simply use the full teardown and destroy, which naturally play nice
+        * with partial initialization.
+        */
+ teardown:
+       /* Only free pages not yet added, so start at 'i' */
+       for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               if (tdcs_pages[i]) {
+                       __free_page(tdcs_pages[i]);
+                       tdcs_pages[i] = NULL;
+               }
+       }
+       if (!kvm_tdx->td.tdcs_pages)
+               kfree(tdcs_pages);
+       tdx_mmu_release_hkid(kvm);
+       tdx_reclaim_td_control_pages(kvm);
+       return ret;
+ free_packages:
+       cpus_read_unlock();
+       free_cpumask_var(packages);
+ free_tdcs:
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               if (tdcs_pages[i])
+                       __free_page(tdcs_pages[i]);
+       }
+       kfree(tdcs_pages);
+       kvm_tdx->td.tdcs_pages = NULL;
+ free_tdr:
+       if (tdr_page)
+               __free_page(tdr_page);
+       kvm_tdx->td.tdr_page = 0;
+ free_hkid:
+       tdx_hkid_free(kvm_tdx);
+       return ret;
+ }
+ static u64 tdx_td_metadata_field_read(struct kvm_tdx *tdx, u64 field_id,
+                                     u64 *data)
+ {
+       u64 err;
+       err = tdh_mng_rd(&tdx->td, field_id, data);
+       return err;
+ }
+ #define TDX_MD_UNREADABLE_LEAF_MASK   GENMASK(30, 7)
+ #define TDX_MD_UNREADABLE_SUBLEAF_MASK        GENMASK(31, 7)
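+ /*
+  * Read one virtualized CPUID leaf/sub-leaf from the TD's metadata: the values
+  * are stored as two 64-bit fields holding EBX:EAX and EDX:ECX.
+  */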
+ static int tdx_read_cpuid(struct kvm_vcpu *vcpu, u32 leaf, u32 sub_leaf,
+                         bool sub_leaf_set, int *entry_index,
+                         struct kvm_cpuid_entry2 *out)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       u64 field_id = TD_MD_FIELD_ID_CPUID_VALUES;
+       u64 ebx_eax, edx_ecx;
+       u64 err = 0;
+       if (sub_leaf > 0b1111111)
+               return -EINVAL;
+       if (*entry_index >= KVM_MAX_CPUID_ENTRIES)
+               return -EINVAL;
+       if (leaf & TDX_MD_UNREADABLE_LEAF_MASK ||
+           sub_leaf & TDX_MD_UNREADABLE_SUBLEAF_MASK)
+               return -EINVAL;
+       /*
+        * bit 23:17, RESERVED: reserved, must be 0;
+        * bit 16,    LEAF_31: leaf number bit 31;
+        * bit 15:9,  LEAF_6_0: leaf number bits 6:0, leaf bits 30:7 are
+        *                      implicitly 0;
+        * bit 8,     SUBLEAF_NA: sub-leaf not applicable flag;
+        * bit 7:1,   SUBLEAF_6_0: sub-leaf number bits 6:0. If SUBLEAF_NA is 1,
+        *                         SUBLEAF_6_0 is all ones.
+        *                         sub-leaf bits 31:7 are implicitly 0;
+        * bit 0,     ELEMENT_I: Element index within field;
+        */
+       field_id |= ((leaf & 0x80000000) ? 1 : 0) << 16;
+       field_id |= (leaf & 0x7f) << 9;
+       if (sub_leaf_set)
+               field_id |= (sub_leaf & 0x7f) << 1;
+       else
+               field_id |= 0x1fe;
+       err = tdx_td_metadata_field_read(kvm_tdx, field_id, &ebx_eax);
+       if (err) //TODO check for specific errors
+               goto err_out;
+       out->eax = (u32) ebx_eax;
+       out->ebx = (u32) (ebx_eax >> 32);
+       field_id++;
+       err = tdx_td_metadata_field_read(kvm_tdx, field_id, &edx_ecx);
+       /*
+        * It would be odd for the edx_ecx read to fail when the ebx_eax read
+        * just succeeded.
+        */
+       if (WARN_ON_ONCE(err))
+               goto err_out;
+       out->ecx = (u32) edx_ecx;
+       out->edx = (u32) (edx_ecx >> 32);
+       out->function = leaf;
+       out->index = sub_leaf;
+       out->flags |= sub_leaf_set ? KVM_CPUID_FLAG_SIGNIFCANT_INDEX : 0;
+       /*
+        * Work around missing support on old TDX modules: fetch the
+        * guest maxpa from gfn_direct_bits.
+        */
+       if (leaf == 0x80000008) {
+               gpa_t gpa_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
+               unsigned int g_maxpa = __ffs(gpa_bits) + 1;
+               out->eax = tdx_set_guest_phys_addr_bits(out->eax, g_maxpa);
+       }
+       (*entry_index)++;
+       return 0;
+ err_out:
+       out->eax = 0;
+       out->ebx = 0;
+       out->ecx = 0;
+       out->edx = 0;
+       return -EIO;
+ }
+ static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       struct kvm_tdx_init_vm *init_vm;
+       struct td_params *td_params = NULL;
+       int ret;
+       BUILD_BUG_ON(sizeof(*init_vm) != 256 + sizeof_field(struct kvm_tdx_init_vm, cpuid));
+       BUILD_BUG_ON(sizeof(struct td_params) != 1024);
+       if (kvm_tdx->state != TD_STATE_UNINITIALIZED)
+               return -EINVAL;
+       if (cmd->flags)
+               return -EINVAL;
+       init_vm = kmalloc(sizeof(*init_vm) +
+                         sizeof(init_vm->cpuid.entries[0]) * KVM_MAX_CPUID_ENTRIES,
+                         GFP_KERNEL);
+       if (!init_vm)
+               return -ENOMEM;
+       if (copy_from_user(init_vm, u64_to_user_ptr(cmd->data), sizeof(*init_vm))) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (init_vm->cpuid.nent > KVM_MAX_CPUID_ENTRIES) {
+               ret = -E2BIG;
+               goto out;
+       }
+       if (copy_from_user(init_vm->cpuid.entries,
+                          u64_to_user_ptr(cmd->data) + sizeof(*init_vm),
+                          flex_array_size(init_vm, cpuid.entries, init_vm->cpuid.nent))) {
+               ret = -EFAULT;
+               goto out;
+       }
+       if (memchr_inv(init_vm->reserved, 0, sizeof(init_vm->reserved))) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (init_vm->cpuid.padding) {
+               ret = -EINVAL;
+               goto out;
+       }
+       td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
+       if (!td_params) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = setup_tdparams(kvm, td_params, init_vm);
+       if (ret)
+               goto out;
+       ret = __tdx_td_init(kvm, td_params, &cmd->hw_error);
+       if (ret)
+               goto out;
+       kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
+       kvm_tdx->tsc_multiplier = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_MULTIPLIER);
+       kvm_tdx->attributes = td_params->attributes;
+       kvm_tdx->xfam = td_params->xfam;
+       if (td_params->config_flags & TDX_CONFIG_FLAGS_MAX_GPAW)
+               kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_5;
+       else
+               kvm->arch.gfn_direct_bits = TDX_SHARED_BIT_PWL_4;
+       kvm_tdx->state = TD_STATE_INITIALIZED;
+ out:
+       /* kfree() accepts NULL. */
+       kfree(init_vm);
+       kfree(td_params);
+       return ret;
+ }
+ void tdx_flush_tlb_current(struct kvm_vcpu *vcpu)
+ {
+       /*
+        * flush_tlb_current() is invoked the first time the vCPU runs or when
+        * the root of the shared EPT is invalidated.
+        * KVM only needs to flush the shared EPT because the TDX module handles
+        * TLB invalidation for the private EPT in tdh_vp_enter().
+        *
+        * A single context invalidation for shared EPT can be performed here.
+        * However, this single context invalidation requires the private EPTP
+        * rather than the shared EPTP to flush shared EPT, as shared EPT uses
+        * private EPTP as its ASID for TLB invalidation.
+        *
+        * To avoid reading back private EPTP, perform a global invalidation for
+        * shared EPT instead to keep this function simple.
+        */
+       ept_sync_global();
+ }
+ void tdx_flush_tlb_all(struct kvm_vcpu *vcpu)
+ {
+       /*
+        * TDX has called tdx_track() in tdx_sept_remove_private_spte() to
+        * ensure that private EPT will be flushed on the next TD enter. No need
+        * to call tdx_track() here again even when this callback is a result of
+        * zapping private EPT.
+        *
+        * Due to the lack of the context to determine which EPT has been
+        * affected by zapping, invoke invept() directly here for both shared
+        * EPT and private EPT for simplicity, though it's not necessary for
+        * private EPT.
+        */
+       ept_sync_global();
+ }
+ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       guard(mutex)(&kvm->slots_lock);
+       if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+               return -EINVAL;
+       /*
+        * Pages are pending for KVM_TDX_INIT_MEM_REGION to issue
+        * TDH.MEM.PAGE.ADD().
+        */
+       if (atomic64_read(&kvm_tdx->nr_premapped))
+               return -EINVAL;
+       cmd->hw_error = tdh_mr_finalize(&kvm_tdx->td);
+       if (tdx_operand_busy(cmd->hw_error))
+               return -EBUSY;
+       if (KVM_BUG_ON(cmd->hw_error, kvm)) {
+               pr_tdx_error(TDH_MR_FINALIZE, cmd->hw_error);
+               return -EIO;
+       }
+       kvm_tdx->state = TD_STATE_RUNNABLE;
+       /* TD_STATE_RUNNABLE must be set before 'pre_fault_allowed' */
+       smp_wmb();
+       kvm->arch.pre_fault_allowed = true;
+       return 0;
+ }
+ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
+ {
+       struct kvm_tdx_cmd tdx_cmd;
+       int r;
+       if (copy_from_user(&tdx_cmd, argp, sizeof(struct kvm_tdx_cmd)))
+               return -EFAULT;
+       /*
+        * Userspace should never set hw_error. It is used by the kernel
+        * to report hardware-defined errors.
+        */
+       if (tdx_cmd.hw_error)
+               return -EINVAL;
+       mutex_lock(&kvm->lock);
+       switch (tdx_cmd.id) {
+       case KVM_TDX_CAPABILITIES:
+               r = tdx_get_capabilities(&tdx_cmd);
+               break;
+       case KVM_TDX_INIT_VM:
+               r = tdx_td_init(kvm, &tdx_cmd);
+               break;
+       case KVM_TDX_FINALIZE_VM:
+               r = tdx_td_finalize(kvm, &tdx_cmd);
+               break;
+       default:
+               r = -EINVAL;
+               goto out;
+       }
+       if (copy_to_user(argp, &tdx_cmd, sizeof(struct kvm_tdx_cmd)))
+               r = -EFAULT;
+ out:
+       mutex_unlock(&kvm->lock);
+       return r;
+ }
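
The expected ordering of these VM-scoped commands is KVM_TDX_CAPABILITIES
(optional), then KVM_TDX_INIT_VM, then vCPU setup and initial memory
population, then KVM_TDX_FINALIZE_VM.  A hedged sketch of the final step,
again assuming the KVM_MEMORY_ENCRYPT_OP dispatch on the VM fd:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	/*
	 * Seal the TD measurement once all initial memory has been added.
	 * A busy TDX module surfaces as EBUSY and can simply be retried,
	 * per tdx_td_finalize() above.
	 */
	static int example_tdx_finalize_vm(int vm_fd)
	{
		struct kvm_tdx_cmd cmd = { .id = KVM_TDX_FINALIZE_VM };

		return ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	}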
+ /* The VMM can pass one 64-bit auxiliary value to the vCPU via RCX for the guest BIOS. */
+ static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       struct page *page;
+       int ret, i;
+       u64 err;
+       page = alloc_page(GFP_KERNEL);
+       if (!page)
+               return -ENOMEM;
+       tdx->vp.tdvpr_page = page;
+       tdx->vp.tdcx_pages = kcalloc(kvm_tdx->td.tdcx_nr_pages, sizeof(*tdx->vp.tdcx_pages),
+                                    GFP_KERNEL);
+       if (!tdx->vp.tdcx_pages) {
+               ret = -ENOMEM;
+               goto free_tdvpr;
+       }
+       for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+               page = alloc_page(GFP_KERNEL);
+               if (!page) {
+                       ret = -ENOMEM;
+                       goto free_tdcx;
+               }
+               tdx->vp.tdcx_pages[i] = page;
+       }
+       err = tdh_vp_create(&kvm_tdx->td, &tdx->vp);
+       if (KVM_BUG_ON(err, vcpu->kvm)) {
+               ret = -EIO;
+               pr_tdx_error(TDH_VP_CREATE, err);
+               goto free_tdcx;
+       }
+       for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+               err = tdh_vp_addcx(&tdx->vp, tdx->vp.tdcx_pages[i]);
+               if (KVM_BUG_ON(err, vcpu->kvm)) {
+                       pr_tdx_error(TDH_VP_ADDCX, err);
+                       /*
+                        * Pages already added are reclaimed by the vcpu_free
+                        * method, but the rest are freed here.
+                        */
+                       for (; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+                               __free_page(tdx->vp.tdcx_pages[i]);
+                               tdx->vp.tdcx_pages[i] = NULL;
+                       }
+                       return -EIO;
+               }
+       }
+       err = tdh_vp_init(&tdx->vp, vcpu_rcx, vcpu->vcpu_id);
+       if (KVM_BUG_ON(err, vcpu->kvm)) {
+               pr_tdx_error(TDH_VP_INIT, err);
+               return -EIO;
+       }
+       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       return 0;
+ free_tdcx:
+       for (i = 0; i < kvm_tdx->td.tdcx_nr_pages; i++) {
+               if (tdx->vp.tdcx_pages[i])
+                       __free_page(tdx->vp.tdcx_pages[i]);
+               tdx->vp.tdcx_pages[i] = NULL;
+       }
+       kfree(tdx->vp.tdcx_pages);
+       tdx->vp.tdcx_pages = NULL;
+ free_tdvpr:
+       if (tdx->vp.tdvpr_page)
+               __free_page(tdx->vp.tdvpr_page);
+       tdx->vp.tdvpr_page = NULL;
+       return ret;
+ }
+ /* Sometimes reads multiple subleafs. Returns how many entries were written. */
+ static int tdx_vcpu_get_cpuid_leaf(struct kvm_vcpu *vcpu, u32 leaf, int *entry_index,
+                                  struct kvm_cpuid_entry2 *output_e)
+ {
+       int sub_leaf = 0;
+       int ret;
+       /* First try without a subleaf */
+       ret = tdx_read_cpuid(vcpu, leaf, 0, false, entry_index, output_e);
+       /* If success, or invalid leaf, just give up */
+       if (ret != -EIO)
+               return ret;
+       /*
+        * If the try without a subleaf failed, try reading subleafs until
+        * failure. The TDX module only supports 6 bits of subleaf index.
+        */
+       while (1) {
+               /* Keep reading subleafs until there is a failure. */
+               if (tdx_read_cpuid(vcpu, leaf, sub_leaf, true, entry_index, output_e))
+                       return !sub_leaf;
+               sub_leaf++;
+               output_e++;
+       }
+       return 0;
+ }
+ static int tdx_vcpu_get_cpuid(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+ {
+       struct kvm_cpuid2 __user *output, *td_cpuid;
+       int r = 0, i = 0, leaf;
+       u32 level;
+       output = u64_to_user_ptr(cmd->data);
+       td_cpuid = kzalloc(sizeof(*td_cpuid) +
+                       sizeof(output->entries[0]) * KVM_MAX_CPUID_ENTRIES,
+                       GFP_KERNEL);
+       if (!td_cpuid)
+               return -ENOMEM;
+       if (copy_from_user(td_cpuid, output, sizeof(*output))) {
+               r = -EFAULT;
+               goto out;
+       }
+       /* Read max CPUID for normal range */
+       if (tdx_vcpu_get_cpuid_leaf(vcpu, 0, &i, &td_cpuid->entries[i])) {
+               r = -EIO;
+               goto out;
+       }
+       level = td_cpuid->entries[0].eax;
+       for (leaf = 1; leaf <= level; leaf++)
+               tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+       /* Read max CPUID for extended range */
+       if (tdx_vcpu_get_cpuid_leaf(vcpu, 0x80000000, &i, &td_cpuid->entries[i])) {
+               r = -EIO;
+               goto out;
+       }
+       level = td_cpuid->entries[i - 1].eax;
+       for (leaf = 0x80000001; leaf <= level; leaf++)
+               tdx_vcpu_get_cpuid_leaf(vcpu, leaf, &i, &td_cpuid->entries[i]);
+       if (td_cpuid->nent < i)
+               r = -E2BIG;
+       td_cpuid->nent = i;
+       if (copy_to_user(output, td_cpuid, sizeof(*output))) {
+               r = -EFAULT;
+               goto out;
+       }
+       if (r == -E2BIG)
+               goto out;
+       if (copy_to_user(output->entries, td_cpuid->entries,
+                        td_cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+               r = -EFAULT;
+ out:
+       kfree(td_cpuid);
+       return r;
+ }
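
KVM_TDX_GET_CPUID hands back the CPUID configuration the TDX module has fixed
for the vCPU.  Userspace passes a struct kvm_cpuid2 whose nent gives the buffer
capacity; KVM writes the actual count back and returns -E2BIG if the buffer was
too small, as the function above shows.  A hedged sketch of the vCPU-fd call:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int example_tdx_get_cpuid(int vcpu_fd, struct kvm_cpuid2 *cpuid)
	{
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_GET_CPUID,
			/* cpuid->nent must hold the entry capacity on input */
			.data = (unsigned long)cpuid,
		};

		return ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	}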
+ static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+ {
+       u64 apic_base;
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       int ret;
+       if (cmd->flags)
+               return -EINVAL;
+       if (tdx->state != VCPU_TD_STATE_UNINITIALIZED)
+               return -EINVAL;
+       /*
+        * TDX requires X2APIC; userspace is responsible for configuring
+        * guest CPUID accordingly.
+        */
+       apic_base = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+               (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0);
+       if (kvm_apic_set_base(vcpu, apic_base, true))
+               return -EINVAL;
+       ret = tdx_td_vcpu_init(vcpu, (u64)cmd->data);
+       if (ret)
+               return ret;
+       td_vmcs_write16(tdx, POSTED_INTR_NV, POSTED_INTR_VECTOR);
+       td_vmcs_write64(tdx, POSTED_INTR_DESC_ADDR, __pa(&tdx->vt.pi_desc));
+       td_vmcs_setbit32(tdx, PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_POSTED_INTR);
+       tdx->state = VCPU_TD_STATE_INITIALIZED;
+       return 0;
+ }
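
KVM_TDX_INIT_VCPU follows the same pattern on the vCPU fd, with cmd.data
carrying the 64-bit value that, per the comment above, the guest BIOS will find
in RCX.  A hedged sketch:

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int example_tdx_init_vcpu(int vcpu_fd, __u64 initial_rcx)
	{
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_INIT_VCPU,
			.data = initial_rcx,	/* forwarded to the guest via RCX */
		};

		return ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
	}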
+ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+ {
+       /*
+        * Yell on INIT, as TDX doesn't support INIT, i.e. KVM should drop all
+        * INIT events.
+        *
+        * Defer initializing vCPU for RESET state until KVM_TDX_INIT_VCPU, as
+        * userspace needs to define the vCPU model before KVM can initialize
+        * vCPU state, e.g. to enable x2APIC.
+        */
+       WARN_ON_ONCE(init_event);
+ }
+ struct tdx_gmem_post_populate_arg {
+       struct kvm_vcpu *vcpu;
+       __u32 flags;
+ };
+ static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+                                 void __user *src, int order, void *_arg)
+ {
+       u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       struct tdx_gmem_post_populate_arg *arg = _arg;
+       struct kvm_vcpu *vcpu = arg->vcpu;
+       gpa_t gpa = gfn_to_gpa(gfn);
+       u8 level = PG_LEVEL_4K;
+       struct page *src_page;
+       int ret, i;
+       u64 err, entry, level_state;
+       /*
+        * Get the source page if it has been faulted in. Return failure if the
+        * source page has been swapped out or unmapped in primary memory.
+        */
+       ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
+       if (ret < 0)
+               return ret;
+       if (ret != 1)
+               return -ENOMEM;
+       ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+       if (ret < 0)
+               goto out;
+       /*
+        * The private mem cannot be zapped after kvm_tdp_map_page()
+        * because all paths are covered by slots_lock and the
+        * filemap invalidate lock.  Check that they are indeed enough.
+        */
+       if (IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
+               scoped_guard(read_lock, &kvm->mmu_lock) {
+                       if (KVM_BUG_ON(!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa), kvm)) {
+                               ret = -EIO;
+                               goto out;
+                       }
+               }
+       }
+       ret = 0;
+       err = tdh_mem_page_add(&kvm_tdx->td, gpa, pfn_to_page(pfn),
+                              src_page, &entry, &level_state);
+       if (err) {
+               ret = unlikely(tdx_operand_busy(err)) ? -EBUSY : -EIO;
+               goto out;
+       }
+       if (!KVM_BUG_ON(!atomic64_read(&kvm_tdx->nr_premapped), kvm))
+               atomic64_dec(&kvm_tdx->nr_premapped);
+       if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
+               for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+                       err = tdh_mr_extend(&kvm_tdx->td, gpa + i, &entry,
+                                           &level_state);
+                       if (err) {
+                               ret = -EIO;
+                               break;
+                       }
+               }
+       }
+ out:
+       put_page(src_page);
+       return ret;
+ }
+ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+ {
+       struct vcpu_tdx *tdx = to_tdx(vcpu);
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       struct kvm_tdx_init_mem_region region;
+       struct tdx_gmem_post_populate_arg arg;
+       long gmem_ret;
+       int ret;
+       if (tdx->state != VCPU_TD_STATE_INITIALIZED)
+               return -EINVAL;
+       guard(mutex)(&kvm->slots_lock);
+       /* Once TD is finalized, the initial guest memory is fixed. */
+       if (kvm_tdx->state == TD_STATE_RUNNABLE)
+               return -EINVAL;
+       if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+               return -EINVAL;
+       if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+               return -EFAULT;
+       if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+           !region.nr_pages ||
+           region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+           !vt_is_tdx_private_gpa(kvm, region.gpa) ||
+           !vt_is_tdx_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+               return -EINVAL;
+       kvm_mmu_reload(vcpu);
+       ret = 0;
+       while (region.nr_pages) {
+               if (signal_pending(current)) {
+                       ret = -EINTR;
+                       break;
+               }
+               arg = (struct tdx_gmem_post_populate_arg) {
+                       .vcpu = vcpu,
+                       .flags = cmd->flags,
+               };
+               gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+                                            u64_to_user_ptr(region.source_addr),
+                                            1, tdx_gmem_post_populate, &arg);
+               if (gmem_ret < 0) {
+                       ret = gmem_ret;
+                       break;
+               }
+               if (gmem_ret != 1) {
+                       ret = -EIO;
+                       break;
+               }
+               region.source_addr += PAGE_SIZE;
+               region.gpa += PAGE_SIZE;
+               region.nr_pages--;
+               cond_resched();
+       }
+       if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+               ret = -EFAULT;
+       return ret;
+ }
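
KVM_TDX_INIT_MEM_REGION points cmd.data at a struct kvm_tdx_init_mem_region
describing the source pages and the private GPA range to populate; as the loop
above shows, KVM writes the partially updated struct back, so an interrupted
call can be retried with the same arguments.  A hedged userspace sketch, here
requesting measurement via KVM_TDX_MEASURE_MEMORY_REGION:

	#include <errno.h>
	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	static int example_tdx_init_mem_region(int vcpu_fd, void *src,
					       __u64 gpa, __u64 nr_pages)
	{
		struct kvm_tdx_init_mem_region region = {
			.source_addr = (unsigned long)src,
			.gpa = gpa,
			.nr_pages = nr_pages,
		};
		struct kvm_tdx_cmd cmd = {
			.id = KVM_TDX_INIT_MEM_REGION,
			.flags = KVM_TDX_MEASURE_MEMORY_REGION,
			.data = (unsigned long)&region,
		};
		int r;

		do {
			r = ioctl(vcpu_fd, KVM_MEMORY_ENCRYPT_OP, &cmd);
		} while (r < 0 && (errno == EINTR || errno == EBUSY));

		return r;
	}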
+ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+ {
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+       struct kvm_tdx_cmd cmd;
+       int ret;
+       if (!is_hkid_assigned(kvm_tdx) || kvm_tdx->state == TD_STATE_RUNNABLE)
+               return -EINVAL;
+       if (copy_from_user(&cmd, argp, sizeof(cmd)))
+               return -EFAULT;
+       if (cmd.hw_error)
+               return -EINVAL;
+       switch (cmd.id) {
+       case KVM_TDX_INIT_VCPU:
+               ret = tdx_vcpu_init(vcpu, &cmd);
+               break;
+       case KVM_TDX_INIT_MEM_REGION:
+               ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+               break;
+       case KVM_TDX_GET_CPUID:
+               ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+       return ret;
+ }
+ int tdx_gmem_private_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn)
+ {
+       return PG_LEVEL_4K;
+ }
+ static int tdx_online_cpu(unsigned int cpu)
+ {
+       unsigned long flags;
+       int r;
+       /* Sanity check CPU is already in post-VMXON */
+       WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE));
+       local_irq_save(flags);
+       r = tdx_cpu_enable();
+       local_irq_restore(flags);
+       return r;
+ }
+ static int tdx_offline_cpu(unsigned int cpu)
+ {
+       int i;
+       /* No TD is running.  Allow any cpu to be offline. */
+       if (!atomic_read(&nr_configured_hkid))
+               return 0;
+       /*
+        * In order to reclaim a TDX HKID (i.e. when deleting a guest TD),
+        * TDH.PHYMEM.PAGE.WBINVD needs to be called on all packages to
+        * program all memory controllers with PCONFIG.  If there are active
+        * TDX HKIDs, refuse to offline the last online CPU of a package.
+        */
+       for_each_online_cpu(i) {
+               /*
+                * Another online CPU was found on the same package;
+                * allow this one to be offlined.
+                */
+               if (i != cpu && topology_physical_package_id(i) ==
+                               topology_physical_package_id(cpu))
+                       return 0;
+       }
+       /*
+        * This is the last cpu of this package.  Don't offline it.
+        *
+        * Because it is hard for a human operator to understand why the
+        * offlining was refused, print a warning.
+        */
+ #define MSG_ALLPKG_ONLINE \
+       "TDX requires all packages to have an online CPU. Delete all TDs in order to offline all CPUs of a package.\n"
+       pr_warn_ratelimited(MSG_ALLPKG_ONLINE);
+       return -EBUSY;
+ }
+ static void __do_tdx_cleanup(void)
+ {
+       /*
+        * Once the TDX module has been initialized, it cannot be disabled
+        * and re-initialized without a runtime update (which the kernel
+        * doesn't support).  Only the cpuhp state needs to be removed here.
+        * The TDX host core code tracks the TDX status and can handle a
+        * 'multiple enabling' scenario.
+        */
+       WARN_ON_ONCE(!tdx_cpuhp_state);
+       cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state);
+       tdx_cpuhp_state = 0;
+ }
+ static void __tdx_cleanup(void)
+ {
+       cpus_read_lock();
+       __do_tdx_cleanup();
+       cpus_read_unlock();
+ }
+ static int __init __do_tdx_bringup(void)
+ {
+       int r;
+       /*
+        * Register a TDX-specific cpuhp callback to call tdx_cpu_enable()
+        * on all online CPUs before calling tdx_enable(), and on any CPU
+        * that subsequently comes online, to make sure it is ready to run
+        * TDX guests.
+        */
+       r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN,
+                                        "kvm/cpu/tdx:online",
+                                        tdx_online_cpu, tdx_offline_cpu);
+       if (r < 0)
+               return r;
+       tdx_cpuhp_state = r;
+       r = tdx_enable();
+       if (r)
+               __do_tdx_cleanup();
+       return r;
+ }
+ static int __init __tdx_bringup(void)
+ {
+       const struct tdx_sys_info_td_conf *td_conf;
+       int r, i;
+       for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
+               /*
+                * Check if MSRs (tdx_uret_msrs) can be saved/restored
+                * before returning to user space.
+                *
+                * this_cpu_ptr(user_return_msrs)->registered isn't checked
+                * because the registration is done at vcpu runtime by
+                * tdx_user_return_msr_update_cache().
+                */
+               tdx_uret_msrs[i].slot = kvm_find_user_return_msr(tdx_uret_msrs[i].msr);
+               if (tdx_uret_msrs[i].slot == -1) {
+                       /* If any MSR isn't supported, it is a KVM bug */
+                       pr_err("MSR %x isn't included by kvm_find_user_return_msr\n",
+                               tdx_uret_msrs[i].msr);
+                       return -EIO;
+               }
+       }
+       /*
+        * Enabling TDX requires enabling hardware virtualization first,
+        * as making SEAMCALLs requires the CPU to be in post-VMXON state.
+        */
+       r = kvm_enable_virtualization();
+       if (r)
+               return r;
+       cpus_read_lock();
+       r = __do_tdx_bringup();
+       cpus_read_unlock();
+       if (r)
+               goto tdx_bringup_err;
+       /* Get TDX global information for later use */
+       tdx_sysinfo = tdx_get_sysinfo();
+       if (WARN_ON_ONCE(!tdx_sysinfo)) {
+               r = -EINVAL;
+               goto get_sysinfo_err;
+       }
+       /* Check TDX module and KVM capabilities */
+       if (!tdx_get_supported_attrs(&tdx_sysinfo->td_conf) ||
+           !tdx_get_supported_xfam(&tdx_sysinfo->td_conf))
+               goto get_sysinfo_err;
+       if (!(tdx_sysinfo->features.tdx_features0 & MD_FIELD_ID_FEATURES0_TOPOLOGY_ENUM))
+               goto get_sysinfo_err;
+       /*
+        * TDX has its own limit on the maximum number of vCPUs it can
+        * support for all TDX guests, in addition to KVM_MAX_VCPUS.
+        * Userspace needs to query a TDX guest's maximum vCPUs by checking
+        * the KVM_CAP_MAX_VCPUS extension on a per-VM basis.
+        *
+        * The TDX module reports this limit via the MAX_VCPU_PER_TD global
+        * metadata.  Different modules may report different values.  Some
+        * older modules may not support this metadata at all (in which
+        * case the limit is U16_MAX).
+        *
+        * In practice, the reported value reflects the maximum number of
+        * logical CPUs that ALL the platforms supported by the module can
+        * possibly have.
+        *
+        * Simply forwarding MAX_VCPU_PER_TD to userspace could result in
+        * an unpredictable ABI.  KVM instead always advertises the number
+        * of logical CPUs the platform has as the maximum number of vCPUs
+        * for TDX guests.
+        *
+        * Make sure the MAX_VCPU_PER_TD reported by the TDX module is not
+        * smaller than the number of logical CPUs, otherwise KVM would
+        * report an unsupported value to userspace.
+        *
+        * Note, a platform with TDX enabled in the BIOS cannot support
+        * physical CPU hotplug, and TDX requires that the BIOS has marked
+        * all logical CPUs in the MADT as enabled.  Just use
+        * num_present_cpus() for the number of logical CPUs.
+        */
+       td_conf = &tdx_sysinfo->td_conf;
+       if (td_conf->max_vcpus_per_td < num_present_cpus()) {
+               pr_err("Disable TDX: MAX_VCPU_PER_TD (%u) smaller than number of logical CPUs (%u).\n",
+                               td_conf->max_vcpus_per_td, num_present_cpus());
+               r = -EINVAL;
+               goto get_sysinfo_err;
+       }
+       if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) {
+               r = -EINVAL;
+               goto get_sysinfo_err;
+       }
+       /*
+        * Leave hardware virtualization enabled after TDX is enabled
+        * successfully.  TDX CPU hotplug depends on this.
+        */
+       return 0;
+ get_sysinfo_err:
+       __tdx_cleanup();
+ tdx_bringup_err:
+       kvm_disable_virtualization();
+       return r;
+ }
+ void tdx_cleanup(void)
+ {
+       if (enable_tdx) {
+               misc_cg_set_capacity(MISC_CG_RES_TDX, 0);
+               __tdx_cleanup();
+               kvm_disable_virtualization();
+       }
+ }
+ int __init tdx_bringup(void)
+ {
+       int r, i;
+       /* tdx_disable_virtualization_cpu() uses associated_tdvcpus. */
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
+       if (!enable_tdx)
+               return 0;
+       if (!enable_ept) {
+               pr_err("EPT is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!tdp_mmu_enabled || !enable_mmio_caching || !enable_ept_ad_bits) {
+               pr_err("TDP MMU and MMIO caching and EPT A/D bit is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!enable_apicv) {
+               pr_err("APICv is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+               pr_err("tdx: OSXSAVE is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+               pr_err("tdx: MOVDIR64B is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!cpu_feature_enabled(X86_FEATURE_SELFSNOOP)) {
+               pr_err("Self-snoop is required for TDX\n");
+               goto success_disable_tdx;
+       }
+       if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
+               pr_err("tdx: no TDX private KeyIDs available\n");
+               goto success_disable_tdx;
+       }
+       if (!enable_virt_at_load) {
+               pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n");
+               goto success_disable_tdx;
+       }
+       /*
+        * Ideally KVM should probe whether TDX module has been loaded
+        * first and then try to bring it up.  But TDX needs to use SEAMCALL
+        * to probe whether the module is loaded (there is no CPUID or MSR
+        * for that), and making a SEAMCALL requires enabling virtualization
+        * first, just like the other steps of bringing up the TDX module.
+        *
+        * So, for simplicity do everything in __tdx_bringup(); the first
+        * SEAMCALL will return -ENODEV when the module is not loaded.  The
+        * only complication is having to make sure that initialization
+        * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other
+        * cases.
+        */
+       r = __tdx_bringup();
+       if (r) {
+               /*
+                * If the TDX module could not be loaded, disable TDX but
+                * don't fail loading the KVM module.  There is no need to
+                * print a "module is not loaded" message because one was
+                * already printed when the first SEAMCALL failed.
+                */
+               if (r == -ENODEV)
+                       goto success_disable_tdx;
+               enable_tdx = 0;
+       }
+       return r;
+ success_disable_tdx:
+       enable_tdx = 0;
+       return 0;
+ }
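
In practice this means TDX support is opt-in: besides the hardware and KVM
configuration prerequisites checked above (EPT, TDP MMU, MMIO caching, EPT A/D
bits, APICv, OSXSAVE, MOVDIR64B, self-snoop, TDX private KeyIDs), the kvm
module must be loaded with enable_virt_at_load=1 as the error message states,
and kvm_intel needs its TDX module parameter enabled (assumed here to be named
tdx; the parameter itself is defined outside this hunk).  If the TDX module is
missing, the first SEAMCALL fails with -ENODEV and KVM falls back to
enable_tdx = 0 without failing module load.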
index 5c5766467a61d434ba2baa79a5faba99bcbd9997,459e6d58a3279811885b6e0e5dd1cf631df7e2f1..ef2d7208dd208a4ccad3ea5a0cf4b657f7a50f11
@@@ -5902,26 -5800,7 +5827,26 @@@ static bool vmx_unhandleable_emulation_
  {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
  
-       if (!vmx->emulation_required)
 -      return vmx->vt.emulation_required && !vmx->rmode.vm86_active &&
++      if (!vmx->vt.emulation_required)
 +              return false;
 +
 +      /*
 +       * It is architecturally impossible for emulation to be required when a
 +       * nested VM-Enter is pending completion, as VM-Enter will VM-Fail if
 +       * guest state is invalid and unrestricted guest is disabled, i.e. KVM
 +       * should synthesize VM-Fail instead emulation L2 code.  This path is
 +       * only reachable if userspace modifies L2 guest state after KVM has
 +       * performed the nested VM-Enter consistency checks.
 +       */
 +      if (vmx->nested.nested_run_pending)
 +              return true;
 +
 +      /*
 +       * KVM only supports emulating exceptions if the vCPU is in Real Mode.
 +       * If emulation is required, KVM can't perform a successful VM-Enter to
 +       * inject the exception.
 +       */
 +      return !vmx->rmode.vm86_active &&
               (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
  }
  
index 3712dde0bf9d1fb829d58708725ad5d8136a716f,d4da2d93674580ccea254090ec20a6debde69986..844e81ee1d96edfc7bedf157cd93a815c427f63d
@@@ -11317,9 -11341,10 +11348,9 @@@ static int __kvm_emulate_halt(struct kv
         */
        ++vcpu->stat.halt_exits;
        if (lapic_in_kernel(vcpu)) {
-               if (kvm_vcpu_has_events(vcpu))
+               if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted)
 -                      vcpu->arch.pv.pv_unhalted = false;
 -              else
 -                      vcpu->arch.mp_state = state;
 +                      state = KVM_MP_STATE_RUNNABLE;
 +              kvm_set_mp_state(vcpu, state);
                return 1;
        } else {
                vcpu->run->exit_reason = reason;
@@@ -12861,18 -12906,8 +12894,19 @@@ EXPORT_SYMBOL_GPL(__x86_set_memory_regi
  
  void kvm_arch_pre_destroy_vm(struct kvm *kvm)
  {
 +      /*
 +       * Stop all background workers and kthreads before destroying vCPUs, as
 +       * iterating over vCPUs in a different task while vCPUs are being freed
 +       * is unsafe, i.e. will lead to use-after-free.  The PIT also needs to
 +       * be stopped before IRQ routing is freed.
 +       */
 +      cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
 +      cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
 +
 +      kvm_free_pit(kvm);
 +
        kvm_mmu_pre_destroy_vm(kvm);
+       static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
  }
  
  void kvm_arch_destroy_vm(struct kvm *kvm)
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge
Simple merge