]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
x86/tdx: Convert shared memory back to private on kexec
author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Fri, 14 Jun 2024 09:58:56 +0000 (12:58 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 4 Oct 2024 14:33:44 +0000 (16:33 +0200)
[ Upstream commit 859e63b789d6b17b3c64e51a0aabdc58752a0254 ]

TDX guests allocate shared buffers to perform I/O. It is done by allocating
pages normally from the buddy allocator and converting them to shared with
set_memory_decrypted().

The second, kexec-ed kernel has no idea which memory has been converted this
way. It only sees E820_TYPE_RAM.

Accessing shared memory via private mapping is fatal. It leads to unrecoverable
TD exit.

On kexec, walk the direct mapping and convert all shared memory back to private.
This makes all RAM private again, so the second kernel may use it normally.

The conversion occurs in two steps: stopping new conversions and unsharing all
memory. In the case of normal kexec, the stopping of conversions takes place
while scheduling is still functioning. This allows for waiting until any ongoing
conversions are finished. The second step is carried out when all CPUs except one
are inactive and interrupts are disabled. This prevents any conflicts with code
that may access shared memory.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Tested-by: Tao Liu <ltao@redhat.com>
Link: https://lore.kernel.org/r/20240614095904.1345461-12-kirill.shutemov@linux.intel.com
Stable-dep-of: d4fc4d014715 ("x86/tdx: Fix "in-kernel MMIO" check")
Signed-off-by: Sasha Levin <sashal@kernel.org>
arch/x86/coco/tdx/tdx.c
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/set_memory.h
arch/x86/mm/pat/set_memory.c

index 729ef77b65865e2755f899caa51e3d5d6b7cefbe..da8b66dce0da5f614f217c3d5d9bbe978e458ceb 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/cpufeature.h>
 #include <linux/export.h>
 #include <linux/io.h>
+#include <linux/kexec.h>
 #include <asm/coco.h>
 #include <asm/tdx.h>
 #include <asm/vmx.h>
@@ -14,6 +15,7 @@
 #include <asm/insn.h>
 #include <asm/insn-eval.h>
 #include <asm/pgtable.h>
+#include <asm/set_memory.h>
 
 /* MMIO direction */
 #define EPT_READ       0
@@ -830,6 +832,95 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
        return 0;
 }
 
+/*
+ * Stop new private<->shared conversions.
+ *
+ * Installed by tdx_early_init() as the x86_platform.guest.enc_kexec_begin
+ * callback. Does nothing when CONFIG_KEXEC_CORE is not enabled.
+ *
+ * A failure to stop conversions is only reported with pr_warn(); the
+ * function returns normally either way.
+ */
+static void tdx_kexec_begin(void)
+{
+       if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+               return;
+
+       /*
+        * Crash kernel reaches here with interrupts disabled: can't wait for
+        * conversions to finish.
+        *
+        * If race happened, just report and proceed.
+        */
+       if (!set_memory_enc_stop_conversion())
+               pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+/*
+ * Walk direct mapping and convert all shared memory back to private.
+ *
+ * Installed by tdx_early_init() as the x86_platform.guest.enc_kexec_finish
+ * callback. Does nothing when CONFIG_KEXEC_CORE is not enabled. Must be
+ * called with interrupts disabled (asserted through lockdep).
+ *
+ * Scans [PAGE_OFFSET, PAGE_OFFSET + get_max_mapped()), clears every entry
+ * that pte_decrypted() reports and asks tdx_enc_status_changed() to unshare
+ * the range it maps. Conversion failures and a mismatch against the
+ * nr_shared counter are only logged; nothing is returned to the caller.
+ */
+static void tdx_kexec_finish(void)
+{
+       unsigned long addr, end;
+       long found = 0, shared;
+
+       if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+               return;
+
+       lockdep_assert_irqs_disabled();
+
+       addr = PAGE_OFFSET;
+       end  = PAGE_OFFSET + get_max_mapped();
+
+       while (addr < end) {
+               unsigned long size;
+               unsigned int level;
+               pte_t *pte;
+
+               /*
+                * The size implied by the reported level is the stride of
+                * the walk, whether or not an entry was found.
+                */
+               pte = lookup_address(addr, &level);
+               size = page_level_size(level);
+
+               if (pte && pte_decrypted(*pte)) {
+                       int pages = size / PAGE_SIZE;
+
+                       /*
+                        * Touching memory with shared bit set triggers implicit
+                        * conversion to shared.
+                        *
+                        * Make sure nobody touches the shared range from
+                        * now on.
+                        */
+                       set_pte(pte, __pte(0));
+
+                       /*
+                        * Memory encryption state persists across kexec.
+                        * If tdx_enc_status_changed() fails in the first
+                        * kernel, it leaves memory in an unknown state.
+                        *
+                        * If that memory remains shared, accessing it in the
+                        * *next* kernel through a private mapping will result
+                        * in an unrecoverable guest shutdown.
+                        *
+                        * The kdump kernel boot is not impacted as it uses
+                        * a pre-reserved memory range that is always private.
+                        * However, gathering crash information could lead to
+                        * a crash if it accesses unconverted memory through
+                        * a private mapping which is possible when accessing
+                        * that memory through /proc/vmcore, for example.
+                        *
+                        * In all cases, print error info in order to leave
+                        * enough bread crumbs for debugging.
+                        */
+                       if (!tdx_enc_status_changed(addr, pages, true)) {
+                               pr_err("Failed to unshare range %#lx-%#lx\n",
+                                      addr, addr + size);
+                       }
+
+                       /*
+                        * Counted even when the conversion above failed, so
+                        * the total reflects every shared page encountered.
+                        */
+                       found += pages;
+               }
+
+               addr += size;
+       }
+
+       /*
+        * Entries were cleared in the loop above; presumably this drops any
+        * stale translations for them.
+        */
+       __flush_tlb_all();
+
+       /*
+        * Sanity check: the number of pages found by the walk is compared
+        * with the nr_shared counter. A mismatch is only reported.
+        */
+       shared = atomic_long_read(&nr_shared);
+       if (shared != found) {
+               pr_err("shared page accounting is off\n");
+               pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
+       }
+}
+
 void __init tdx_early_init(void)
 {
        struct tdx_module_args args = {
@@ -889,6 +980,9 @@ void __init tdx_early_init(void)
        x86_platform.guest.enc_cache_flush_required  = tdx_cache_flush_required;
        x86_platform.guest.enc_tlb_flush_required    = tdx_tlb_flush_required;
 
+       x86_platform.guest.enc_kexec_begin           = tdx_kexec_begin;
+       x86_platform.guest.enc_kexec_finish          = tdx_kexec_finish;
+
        /*
         * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
         * bringup low level code. That raises #VE which cannot be handled
index 65b8e5bb902cc3b3d217bfa754914abd987d72ac..e39311a89bf478ee17cee49ddee148d56921676c 100644 (file)
@@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
        return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
+/*
+ * Return true if applying cc_mkdec() to the raw PTE value leaves it
+ * unchanged, i.e. the entry already has the form cc_mkdec() produces.
+ * NOTE(review): presumably this means the entry maps shared (decrypted)
+ * memory -- confirm against the cc_mkdec() implementation.
+ */
+static inline bool pte_decrypted(pte_t pte)
+{
+       return cc_mkdec(pte_val(pte)) == pte_val(pte);
+}
+
 #define pmd_dirty pmd_dirty
 static inline bool pmd_dirty(pmd_t pmd)
 {
index 9aee31862b4a8b8cbf2242db991a5cbeb3d41e21..4b2abce2e3e7d6b215fe5847ee68754398090bfa 100644 (file)
@@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_np(unsigned long addr, int numpages);
 int set_memory_p(unsigned long addr, int numpages);
 int set_memory_4k(unsigned long addr, int numpages);
+
+bool set_memory_enc_stop_conversion(void);
 int set_memory_encrypted(unsigned long addr, int numpages);
 int set_memory_decrypted(unsigned long addr, int numpages);
+
 int set_memory_np_noalias(unsigned long addr, int numpages);
 int set_memory_nonglobal(unsigned long addr, int numpages);
 int set_memory_global(unsigned long addr, int numpages);
index 498812f067cd597e3c75871be1e4f148c70daa43..1356e25e6d12548d7246b1cb0e9babc1d848b823 100644 (file)
@@ -2228,12 +2228,48 @@ vmm_fail:
        return ret;
 }
 
+/*
+ * The lock serializes conversions between private and shared memory.
+ *
+ * It is taken for read on conversion. A write lock guarantees that no
+ * concurrent conversions are in progress.
+ *
+ * Readers (__set_memory_enc_dec()) only ever use a trylock and bail out
+ * with -EBUSY; the writer (set_memory_enc_stop_conversion()) does not
+ * release the lock once it has been taken.
+ */
+static DECLARE_RWSEM(mem_enc_lock);
+
+/*
+ * Stop new private<->shared conversions.
+ *
+ * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
+ * The lock is not released to prevent new conversions from being started.
+ *
+ * Return: true once the exclusive lock is held. false is only possible
+ * when oops_in_progress is set and the non-sleeping trylock did not get
+ * the lock.
+ */
+bool set_memory_enc_stop_conversion(void)
+{
+       /*
+        * In a crash scenario, sleep is not allowed. Try to take the lock.
+        * Failure indicates that there is a race with the conversion.
+        */
+       if (oops_in_progress)
+               return down_write_trylock(&mem_enc_lock);
+
+       down_write(&mem_enc_lock);
+
+       return true;
+}
+
+/*
+ * Forward a conversion request for @numpages pages at @addr to
+ * __set_memory_enc_pgtable(), holding mem_enc_lock for read meanwhile.
+ *
+ * Return: 0 when cc_platform_has(CC_ATTR_MEM_ENCRYPT) is false, -EBUSY
+ * when mem_enc_lock cannot be taken for read, otherwise whatever
+ * __set_memory_enc_pgtable() returns.
+ */
 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 {
-       if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
-               return __set_memory_enc_pgtable(addr, numpages, enc);
+       int ret = 0;
 
-       return 0;
+       if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
+               /*
+                * Trylock, not down_read(): after
+                * set_memory_enc_stop_conversion() the write lock is never
+                * released, so a blocking reader would wait forever.
+                */
+               if (!down_read_trylock(&mem_enc_lock))
+                       return -EBUSY;
+
+               ret = __set_memory_enc_pgtable(addr, numpages, enc);
+
+               up_read(&mem_enc_lock);
+       }
+
+       return ret;
 }
 
 int set_memory_encrypted(unsigned long addr, int numpages)