KVM: TDX: create/destroy VM structure
author     Isaku Yamahata <isaku.yamahata@intel.com>
           Tue, 25 Feb 2025 17:45:13 +0000 (12:45 -0500)
committer  Paolo Bonzini <pbonzini@redhat.com>
           Fri, 14 Mar 2025 18:20:50 +0000 (14:20 -0400)
Implement management of the TDX private KeyID as part of creating, destroying
and freeing the VM structure for a TDX guest.

When creating a TDX guest, assign it a TDX private KeyID for memory
encryption and allocate the pages used for the Trust Domain Root (TDR) and
Trust Domain Control Structure (TDCS).

On destruction, free the allocated pages, and the KeyID.

Before tearing down the private page tables, TDX requires the guest TD to
be destroyed by reclaiming the KeyID. Do it in the vm_pre_destroy() kvm_x86_ops
hook. The TDR control structures can be freed in the vm_destroy() hook,
which runs last.
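
The resulting two-step teardown, distilled from the x86 and VMX hunks below
(again only a summary of this patch, with error paths omitted):

    /* kvm_arch_pre_destroy_vm() -> vt_vm_pre_destroy() */
    tdx_mmu_release_hkid(kvm);  /* TDH.PHYMEM.CACHE.WB + TDH.MNG.KEY.FREEID */

    /* ... KVM tears down the private page tables ... */

    /* kvm_arch_destroy_vm() -> vt_vm_destroy(), which runs last */
    tdx_vm_destroy(kvm);        /* TDH.PHYMEM.PAGE.RECLAIM of TDCS and TDR */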

Co-developed-by: Tony Lindgren <tony.lindgren@linux.intel.com>
Signed-off-by: Tony Lindgren <tony.lindgren@linux.intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Co-developed-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Kai Huang <kai.huang@intel.com>
Co-developed-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 - Fix build issue in kvm-coco-queue
 - Init ret earlier to fix __tdx_td_init() error handling. (Chao)
 - Standardize -EAGAIN for __tdx_td_init() retry errors (Rick)
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
arch/x86/include/asm/kvm-x86-ops.h
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/Kconfig
arch/x86/kvm/vmx/main.c
arch/x86/kvm/vmx/tdx.c
arch/x86/kvm/vmx/tdx.h
arch/x86/kvm/vmx/x86_ops.h
arch/x86/kvm/x86.c

index 1eca04087cf4f4ddaddc6bd29e444ed15fb6ef31..e6cb89ced1fd792e0c5d6cb057ea0036b396555d 100644 (file)
@@ -21,6 +21,7 @@ KVM_X86_OP(has_emulated_msr)
 KVM_X86_OP(vcpu_after_set_cpuid)
 KVM_X86_OP(vm_init)
 KVM_X86_OP_OPTIONAL(vm_destroy)
+KVM_X86_OP_OPTIONAL(vm_pre_destroy)
 KVM_X86_OP_OPTIONAL_RET0(vcpu_precreate)
 KVM_X86_OP(vcpu_create)
 KVM_X86_OP(vcpu_free)
index ee55d1f753e8e9b14d934c6c2f0f0409515f9282..405d3892427923e082687756048e82098439136e 100644 (file)
@@ -1665,6 +1665,7 @@ struct kvm_x86_ops {
        unsigned int vm_size;
        int (*vm_init)(struct kvm *kvm);
        void (*vm_destroy)(struct kvm *kvm);
+       void (*vm_pre_destroy)(struct kvm *kvm);
 
        /* Create, but do not attach this VCPU */
        int (*vcpu_precreate)(struct kvm *kvm);
index fe8cbee6f6143a77ba18b9e1d6e76443ac2b95ee..0d445a317f61fe59f1376c52bee9c9caacbe678a 100644 (file)
@@ -94,6 +94,8 @@ config KVM_SW_PROTECTED_VM
 config KVM_INTEL
        tristate "KVM for Intel (and compatible) processors support"
        depends on KVM && IA32_FEAT_CTL
+       select KVM_GENERIC_PRIVATE_MEM if INTEL_TDX_HOST
+       select KVM_GENERIC_MEMORY_ATTRIBUTES if INTEL_TDX_HOST
        help
          Provides support for KVM on processors equipped with Intel's VT
          extensions, a.k.a. Virtual Machine Extensions (VMX).
index 69c65085f81acaaf24dcfd0542e5404303d40991..97737948774ae2156ee6540c3e44bd01b0801491 100644 (file)
@@ -41,6 +41,28 @@ static __init int vt_hardware_setup(void)
        return 0;
 }
 
+static int vt_vm_init(struct kvm *kvm)
+{
+       if (is_td(kvm))
+               return tdx_vm_init(kvm);
+
+       return vmx_vm_init(kvm);
+}
+
+static void vt_vm_pre_destroy(struct kvm *kvm)
+{
+       if (is_td(kvm))
+               return tdx_mmu_release_hkid(kvm);
+}
+
+static void vt_vm_destroy(struct kvm *kvm)
+{
+       if (is_td(kvm))
+               return tdx_vm_destroy(kvm);
+
+       vmx_vm_destroy(kvm);
+}
+
 static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
 {
        if (!is_td(kvm))
@@ -72,8 +94,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
        .has_emulated_msr = vmx_has_emulated_msr,
 
        .vm_size = sizeof(struct kvm_vmx),
-       .vm_init = vmx_vm_init,
-       .vm_destroy = vmx_vm_destroy,
+
+       .vm_init = vt_vm_init,
+       .vm_pre_destroy = vt_vm_pre_destroy,
+       .vm_destroy = vt_vm_destroy,
 
        .vcpu_precreate = vmx_vcpu_precreate,
        .vcpu_create = vmx_vcpu_create,
index 3e7dc251e5e0b6814e5c9cc0e552281c2955ad40..57eab6fc9f89f9e5ada4e05e36d5a8272502d073 100644 (file)
@@ -120,6 +120,266 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
        return 0;
 }
 
+/*
+ * Some SEAMCALLs acquire the TDX module globally, and can fail with
+ * TDX_OPERAND_BUSY.  Use a global mutex to serialize these SEAMCALLs.
+ */
+static DEFINE_MUTEX(tdx_lock);
+
+static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx)
+{
+       tdx_guest_keyid_free(kvm_tdx->hkid);
+       kvm_tdx->hkid = -1;
+}
+
+static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
+{
+       return kvm_tdx->hkid > 0;
+}
+
+static void tdx_clear_page(struct page *page)
+{
+       const void *zero_page = (const void *) page_to_virt(ZERO_PAGE(0));
+       void *dest = page_to_virt(page);
+       unsigned long i;
+
+       /*
+        * The page could have been poisoned.  MOVDIR64B also clears
+        * the poison bit so the kernel can safely use the page again.
+        */
+       for (i = 0; i < PAGE_SIZE; i += 64)
+               movdir64b(dest + i, zero_page);
+       /*
+        * MOVDIR64B store uses WC buffer.  Prevent following memory reads
+        * from seeing potentially poisoned cache.
+        */
+       __mb();
+}
+
+/* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
+static int __tdx_reclaim_page(struct page *page)
+{
+       u64 err, rcx, rdx, r8;
+
+       err = tdh_phymem_page_reclaim(page, &rcx, &rdx, &r8);
+
+       /*
+        * No need to check for TDX_OPERAND_BUSY; all TD pages are freed
+        * before the HKID is released and control pages have also been
+        * released at this point, so there is no possibility of contention.
+        */
+       if (WARN_ON_ONCE(err)) {
+               pr_tdx_error_3(TDH_PHYMEM_PAGE_RECLAIM, err, rcx, rdx, r8);
+               return -EIO;
+       }
+       return 0;
+}
+
+static int tdx_reclaim_page(struct page *page)
+{
+       int r;
+
+       r = __tdx_reclaim_page(page);
+       if (!r)
+               tdx_clear_page(page);
+       return r;
+}
+
+
+/*
+ * Reclaim the TD control page(s) which are crypto-protected by the TDX
+ * guest's private KeyID.  Assume the cache associated with the TDX private
+ * KeyID has been flushed.
+ */
+static void tdx_reclaim_control_page(struct page *ctrl_page)
+{
+       /*
+        * Leak the page if the kernel failed to reclaim the page.
+        * The kernel cannot use it safely anymore.
+        */
+       if (tdx_reclaim_page(ctrl_page))
+               return;
+
+       __free_page(ctrl_page);
+}
+
+#define TDX_SEAMCALL_RETRIES 10000
+
+static void smp_func_do_phymem_cache_wb(void *unused)
+{
+       u64 err = 0;
+       bool resume;
+       int i;
+
+       /*
+        * TDH.PHYMEM.CACHE.WB flushes caches associated with any TDX private
+        * KeyID on the package or core.  The TDX module may not finish the
+        * cache flush but return TDX_INTERRUPTED_RESUMABLE instead.  The
+        * kernel should retry it until it returns success w/o rescheduling.
+        */
+       for (i = TDX_SEAMCALL_RETRIES; i > 0; i--) {
+               resume = !!err;
+               err = tdh_phymem_cache_wb(resume);
+               switch (err) {
+               case TDX_INTERRUPTED_RESUMABLE:
+                       continue;
+               case TDX_NO_HKID_READY_TO_WBCACHE:
+                       err = TDX_SUCCESS; /* Already done by other thread */
+                       fallthrough;
+               default:
+                       goto out;
+               }
+       }
+
+out:
+       if (WARN_ON_ONCE(err))
+               pr_tdx_error(TDH_PHYMEM_CACHE_WB, err);
+}
+
+void tdx_mmu_release_hkid(struct kvm *kvm)
+{
+       bool packages_allocated, targets_allocated;
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       cpumask_var_t packages, targets;
+       u64 err;
+       int i;
+
+       if (!is_hkid_assigned(kvm_tdx))
+               return;
+
+       /* KeyID has been allocated but guest is not yet configured */
+       if (!kvm_tdx->td.tdr_page) {
+               tdx_hkid_free(kvm_tdx);
+               return;
+       }
+
+       packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
+       targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
+       cpus_read_lock();
+
+       /*
+        * TDH.PHYMEM.CACHE.WB tries to acquire the TDX module global lock
+        * and can fail with TDX_OPERAND_BUSY when it fails to get the lock.
+        * Multiple TDX guests can be destroyed simultaneously.  Take the
+        * mutex to serialize them and avoid that error.
+        */
+       mutex_lock(&tdx_lock);
+
+       /*
+        * The HKID is released here, from the vm_pre_destroy() hook.  There
+        * should be no more vCPU associations at this point, as all vCPU fds
+        * have been released before VM destruction starts.
+        */
+       for_each_online_cpu(i) {
+               if (packages_allocated &&
+                   cpumask_test_and_set_cpu(topology_physical_package_id(i),
+                                            packages))
+                       continue;
+               if (targets_allocated)
+                       cpumask_set_cpu(i, targets);
+       }
+       if (targets_allocated)
+               on_each_cpu_mask(targets, smp_func_do_phymem_cache_wb, NULL, true);
+       else
+               on_each_cpu(smp_func_do_phymem_cache_wb, NULL, true);
+       /*
+        * In the case of error in smp_func_do_phymem_cache_wb(), the following
+        * tdh_mng_key_freeid() will fail.
+        */
+       err = tdh_mng_key_freeid(&kvm_tdx->td);
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_MNG_KEY_FREEID, err);
+               pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
+                      kvm_tdx->hkid);
+       } else {
+               tdx_hkid_free(kvm_tdx);
+       }
+
+       mutex_unlock(&tdx_lock);
+       cpus_read_unlock();
+       free_cpumask_var(targets);
+       free_cpumask_var(packages);
+}
+
+static void tdx_reclaim_td_control_pages(struct kvm *kvm)
+{
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       u64 err;
+       int i;
+
+       /*
+        * tdx_mmu_release_hkid() failed to reclaim the HKID, i.e. something
+        * went badly wrong with the TDX module.  Give up on freeing the TD
+        * pages.  That function already warned, so don't warn again here.
+        */
+       if (is_hkid_assigned(kvm_tdx))
+               return;
+
+       if (kvm_tdx->td.tdcs_pages) {
+               for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+                       if (!kvm_tdx->td.tdcs_pages[i])
+                               continue;
+
+                       tdx_reclaim_control_page(kvm_tdx->td.tdcs_pages[i]);
+               }
+               kfree(kvm_tdx->td.tdcs_pages);
+               kvm_tdx->td.tdcs_pages = NULL;
+       }
+
+       if (!kvm_tdx->td.tdr_page)
+               return;
+
+       if (__tdx_reclaim_page(kvm_tdx->td.tdr_page))
+               return;
+
+       /*
+        * Use a SEAMCALL to ask the TDX module to flush the cache based on the
+        * KeyID.  The TDX module may access the TDR while operating on the TD
+        * (especially when it is reclaiming the TDCS).
+        */
+       err = tdh_phymem_page_wbinvd_tdr(&kvm_tdx->td);
+       if (KVM_BUG_ON(err, kvm)) {
+               pr_tdx_error(TDH_PHYMEM_PAGE_WBINVD, err);
+               return;
+       }
+       tdx_clear_page(kvm_tdx->td.tdr_page);
+
+       __free_page(kvm_tdx->td.tdr_page);
+       kvm_tdx->td.tdr_page = NULL;
+}
+
+void tdx_vm_destroy(struct kvm *kvm)
+{
+       tdx_reclaim_td_control_pages(kvm);
+}
+
+static int tdx_do_tdh_mng_key_config(void *param)
+{
+       struct kvm_tdx *kvm_tdx = param;
+       u64 err;
+
+       /* TDX_RND_NO_ENTROPY related retries are handled by sc_retry() */
+       err = tdh_mng_key_config(&kvm_tdx->td);
+
+       if (KVM_BUG_ON(err, &kvm_tdx->kvm)) {
+               pr_tdx_error(TDH_MNG_KEY_CONFIG, err);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static int __tdx_td_init(struct kvm *kvm);
+
+int tdx_vm_init(struct kvm *kvm)
+{
+       kvm->arch.has_protected_state = true;
+       kvm->arch.has_private_mem = true;
+
+       /* Placeholder for TDX-specific logic. */
+       return __tdx_td_init(kvm);
+}
+
 static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
 {
        const struct tdx_sys_info_td_conf *td_conf = &tdx_sysinfo->td_conf;
@@ -168,6 +428,177 @@ out:
        return ret;
 }
 
+static int __tdx_td_init(struct kvm *kvm)
+{
+       struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+       cpumask_var_t packages;
+       struct page **tdcs_pages = NULL;
+       struct page *tdr_page;
+       int ret, i;
+       u64 err;
+
+       ret = tdx_guest_keyid_alloc();
+       if (ret < 0)
+               return ret;
+       kvm_tdx->hkid = ret;
+
+       ret = -ENOMEM;
+
+       tdr_page = alloc_page(GFP_KERNEL);
+       if (!tdr_page)
+               goto free_hkid;
+
+       kvm_tdx->td.tdcs_nr_pages = tdx_sysinfo->td_ctrl.tdcs_base_size / PAGE_SIZE;
+       tdcs_pages = kcalloc(kvm_tdx->td.tdcs_nr_pages, sizeof(*kvm_tdx->td.tdcs_pages),
+                            GFP_KERNEL | __GFP_ZERO);
+       if (!tdcs_pages)
+               goto free_tdr;
+
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               tdcs_pages[i] = alloc_page(GFP_KERNEL);
+               if (!tdcs_pages[i])
+                       goto free_tdcs;
+       }
+
+       if (!zalloc_cpumask_var(&packages, GFP_KERNEL))
+               goto free_tdcs;
+
+       cpus_read_lock();
+
+       /*
+        * At least one CPU of each package must be online in order to
+        * program the host key ID on all packages.  Check that here.
+        */
+       for_each_present_cpu(i)
+               cpumask_set_cpu(topology_physical_package_id(i), packages);
+       for_each_online_cpu(i)
+               cpumask_clear_cpu(topology_physical_package_id(i), packages);
+       if (!cpumask_empty(packages)) {
+               ret = -EIO;
+               /*
+                * Because it's hard for a human operator to figure out the
+                * reason, print a warning.
+                */
+#define MSG_ALLPKG     "All packages need to have an online CPU to create a TD. Online a CPU and retry.\n"
+               pr_warn_ratelimited(MSG_ALLPKG);
+               goto free_packages;
+       }
+
+       /*
+        * TDH.MNG.CREATE tries to grab the TDX module global lock and fails
+        * with TDX_OPERAND_BUSY when it cannot get the lock.  Take the
+        * global mutex to prevent that failure.
+        */
+       mutex_lock(&tdx_lock);
+       kvm_tdx->td.tdr_page = tdr_page;
+       err = tdh_mng_create(&kvm_tdx->td, kvm_tdx->hkid);
+       mutex_unlock(&tdx_lock);
+
+       if (err == TDX_RND_NO_ENTROPY) {
+               ret = -EAGAIN;
+               goto free_packages;
+       }
+
+       if (WARN_ON_ONCE(err)) {
+               pr_tdx_error(TDH_MNG_CREATE, err);
+               ret = -EIO;
+               goto free_packages;
+       }
+
+       for_each_online_cpu(i) {
+               int pkg = topology_physical_package_id(i);
+
+               if (cpumask_test_and_set_cpu(pkg, packages))
+                       continue;
+
+               /*
+                * Program the memory controller in the package with an
+                * encryption key associated with the TDX private host key ID
+                * assigned to this TDR.  Concurrent operations on the same memory
+                * controller result in TDX_OPERAND_BUSY.  No locking is needed
+                * beyond the cpus_read_lock() above as it serializes against
+                * hotplug and the first online CPU of the package is always
+                * used. We never have two CPUs in the same socket trying to
+                * program the key.
+                */
+               ret = smp_call_on_cpu(i, tdx_do_tdh_mng_key_config,
+                                     kvm_tdx, true);
+               if (ret)
+                       break;
+       }
+       cpus_read_unlock();
+       free_cpumask_var(packages);
+       if (ret) {
+               i = 0;
+               goto teardown;
+       }
+
+       kvm_tdx->td.tdcs_pages = tdcs_pages;
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               err = tdh_mng_addcx(&kvm_tdx->td, tdcs_pages[i]);
+               if (err == TDX_RND_NO_ENTROPY) {
+                       /* Here it's hard for userspace to retry; return -EAGAIN anyway. */
+                       ret = -EAGAIN;
+                       goto teardown;
+               }
+               if (WARN_ON_ONCE(err)) {
+                       pr_tdx_error(TDH_MNG_ADDCX, err);
+                       ret = -EIO;
+                       goto teardown;
+               }
+       }
+
+       /*
+        * Note, TDH_MNG_INIT cannot be invoked here.  TDH_MNG_INIT requires a
+        * dedicated ioctl() to provide the CPUID values used to configure the TD.
+        */
+       return 0;
+
+       /*
+        * The sequence for freeing resources from a partially initialized TD
+        * varies based on where in the initialization flow failure occurred.
+        * Simply use the full teardown and destroy, which naturally play nice
+        * with partial initialization.
+        */
+teardown:
+       /* Only free pages not yet added, so start at 'i' */
+       for (; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               if (tdcs_pages[i]) {
+                       __free_page(tdcs_pages[i]);
+                       tdcs_pages[i] = NULL;
+               }
+       }
+       if (!kvm_tdx->td.tdcs_pages)
+               kfree(tdcs_pages);
+
+       tdx_mmu_release_hkid(kvm);
+       tdx_reclaim_td_control_pages(kvm);
+
+       return ret;
+
+free_packages:
+       cpus_read_unlock();
+       free_cpumask_var(packages);
+
+free_tdcs:
+       for (i = 0; i < kvm_tdx->td.tdcs_nr_pages; i++) {
+               if (tdcs_pages[i])
+                       __free_page(tdcs_pages[i]);
+       }
+       kfree(tdcs_pages);
+       kvm_tdx->td.tdcs_pages = NULL;
+
+free_tdr:
+       if (tdr_page)
+               __free_page(tdr_page);
+       kvm_tdx->td.tdr_page = NULL;
+
+free_hkid:
+       tdx_hkid_free(kvm_tdx);
+
+       return ret;
+}
+
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
 {
        struct kvm_tdx_cmd tdx_cmd;
@@ -324,6 +755,11 @@ int __init tdx_bringup(void)
        if (!enable_tdx)
                return 0;
 
+       if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
+               pr_err("tdx: MOVDIR64B is required for TDX\n");
+               goto success_disable_tdx;
+       }
+
        if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) {
                pr_err("tdx: no TDX private KeyIDs available\n");
                goto success_disable_tdx;
index f1e861cbee3bafa789ed2bc411b94064101889c1..afa4e2242311a2fc9e831060f125bf6672114bf3 100644 (file)
@@ -13,7 +13,9 @@ extern bool enable_tdx;
 
 struct kvm_tdx {
        struct kvm kvm;
-       /* TDX specific members follow. */
+       int hkid;
+
+       struct tdx_td td;
 };
 
 struct vcpu_tdx {
index 75e0ca29a39e0b727f208373260f273dbea34fb7..85c78639d476f2fcd01b70d8f7286e58b0cce733 100644 (file)
@@ -122,8 +122,14 @@ void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu);
 void vmx_setup_mce(struct kvm_vcpu *vcpu);
 
 #ifdef CONFIG_KVM_INTEL_TDX
+int tdx_vm_init(struct kvm *kvm);
+void tdx_mmu_release_hkid(struct kvm *kvm);
+void tdx_vm_destroy(struct kvm *kvm);
 int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
 #else
+static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
+static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
+static inline void tdx_vm_destroy(struct kvm *kvm) {}
 static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
 #endif
 
index 2434da7ac973721e22d0d1393fb0aa18eb1f741d..2a2d2b6c283e90f79c16572d665ba334b3d27ccf 100644 (file)
@@ -12870,6 +12870,7 @@ EXPORT_SYMBOL_GPL(__x86_set_memory_region);
 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
 {
        kvm_mmu_pre_destroy_vm(kvm);
+       static_call_cond(kvm_x86_vm_pre_destroy)(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)