git.ipfire.org Git - thirdparty/linux.git/commitdiff
kexec: add KHO parsing support
author Alexander Graf <graf@amazon.com>
Fri, 9 May 2025 07:46:23 +0000 (00:46 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Tue, 13 May 2025 06:50:39 +0000 (23:50 -0700)
When we have a KHO kexec, we get an FDT blob and scratch region to
populate the state of the system.  Provide helper functions that allow
architecture code to easily handle memory reservations based on them and
give device drivers visibility into the KHO FDT and memory reservations so
they can recover their own state.

Include a fix from Arnd Bergmann <arnd@arndb.de>
https://lore.kernel.org/lkml/20250424093302.3894961-1-arnd@kernel.org/.

Link: https://lkml.kernel.org/r/20250509074635.3187114-6-changyuanl@google.com
Signed-off-by: Alexander Graf <graf@amazon.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Co-developed-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anthony Yznaga <anthony.yznaga@oracle.com>
Cc: Ashish Kalra <ashish.kalra@amd.com>
Cc: Ben Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Gowans <jgowans@amazon.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Krzysztof Kozlowski <krzk@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Pratyush Yadav <ptyadav@amazon.de>
Cc: Rob Herring <robh@kernel.org>
Cc: Saravana Kannan <saravanak@google.com>
Cc: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Lendacky <thomas.lendacky@amd.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/kexec_handover.h
kernel/kexec_handover.c
mm/memblock.c

index 2e19004776f6bd7f6328b065a4657432a9a10efc..02dcfc8c427e35c39907cea97f1088b1c87bd9eb 100644 (file)
@@ -24,11 +24,15 @@ struct kho_serialization;
 bool kho_is_enabled(void);
 
 int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt);
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys);
 
 int register_kho_notifier(struct notifier_block *nb);
 int unregister_kho_notifier(struct notifier_block *nb);
 
 void kho_memory_init(void);
+
+void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys,
+                 u64 scratch_len);
 #else
 static inline bool kho_is_enabled(void)
 {
@@ -41,6 +45,11 @@ static inline int kho_add_subtree(struct kho_serialization *ser,
        return -EOPNOTSUPP;
 }
 
+static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+       return -EOPNOTSUPP;
+}
+
 static inline int register_kho_notifier(struct notifier_block *nb)
 {
        return -EOPNOTSUPP;
@@ -54,6 +63,11 @@ static inline int unregister_kho_notifier(struct notifier_block *nb)
 static inline void kho_memory_init(void)
 {
 }
+
+static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+                               phys_addr_t scratch_phys, u64 scratch_len)
+{
+}
 #endif /* CONFIG_KEXEC_HANDOVER */
 
 #endif /* LINUX_KEXEC_HANDOVER_H */
index e541d3d5003d1c66be28659c67781f8f543fe14d..59f3cf9557f5083a8ccf3d63b5cc65fdf4cfb387 100644 (file)
@@ -17,6 +17,9 @@
 #include <linux/memblock.h>
 #include <linux/notifier.h>
 #include <linux/page-isolation.h>
+
+#include <asm/early_ioremap.h>
+
 /*
  * KHO is tightly coupled with mm init and needs access to some of mm
  * internal APIs.
@@ -501,9 +504,112 @@ err_rmdir:
        return -ENOENT;
 }
 
+struct kho_in {
+       struct dentry *dir;
+       phys_addr_t fdt_phys;
+       phys_addr_t scratch_phys;
+       struct list_head fdt_list;
+};
+
+static struct kho_in kho_in = {
+       .fdt_list = LIST_HEAD_INIT(kho_in.fdt_list),
+};
+
+static const void *kho_get_fdt(void)
+{
+       return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL;
+}
+
+/**
+ * kho_retrieve_subtree - retrieve a preserved sub FDT by its name.
+ * @name: the name of the sub FDT passed to kho_add_subtree().
+ * @phys: if found, the physical address of the sub FDT is stored in @phys.
+ *
+ * Retrieve a preserved sub FDT named @name and store its physical
+ * address in @phys.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_retrieve_subtree(const char *name, phys_addr_t *phys)
+{
+       const void *fdt = kho_get_fdt();
+       const u64 *val;
+       int offset, len;
+
+       if (!fdt)
+               return -ENOENT;
+
+       if (!phys)
+               return -EINVAL;
+
+       offset = fdt_subnode_offset(fdt, 0, name);
+       if (offset < 0)
+               return -ENOENT;
+
+       val = fdt_getprop(fdt, offset, PROP_SUB_FDT, &len);
+       if (!val || len != sizeof(*val))
+               return -EINVAL;
+
+       *phys = (phys_addr_t)*val;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kho_retrieve_subtree);
+
+/* Handling for debugfs/kho/in */
+
+static __init int kho_in_debugfs_init(const void *fdt)
+{
+       struct dentry *sub_fdt_dir;
+       int err, child;
+
+       kho_in.dir = debugfs_create_dir("in", debugfs_root);
+       if (IS_ERR(kho_in.dir))
+               return PTR_ERR(kho_in.dir);
+
+       sub_fdt_dir = debugfs_create_dir("sub_fdts", kho_in.dir);
+       if (IS_ERR(sub_fdt_dir)) {
+               err = PTR_ERR(sub_fdt_dir);
+               goto err_rmdir;
+       }
+
+       err = kho_debugfs_fdt_add(&kho_in.fdt_list, kho_in.dir, "fdt", fdt);
+       if (err)
+               goto err_rmdir;
+
+       fdt_for_each_subnode(child, fdt, 0) {
+               int len = 0;
+               const char *name = fdt_get_name(fdt, child, NULL);
+               const u64 *fdt_phys;
+
+               fdt_phys = fdt_getprop(fdt, child, "fdt", &len);
+               if (!fdt_phys)
+                       continue;
+               if (len != sizeof(*fdt_phys)) {
+                       pr_warn("node `%s`'s prop `fdt` has invalid length: %d\n",
+                               name, len);
+                       continue;
+               }
+               err = kho_debugfs_fdt_add(&kho_in.fdt_list, sub_fdt_dir, name,
+                                         phys_to_virt(*fdt_phys));
+               if (err) {
+                       pr_warn("failed to add fdt `%s` to debugfs: %d\n", name,
+                               err);
+                       continue;
+               }
+       }
+
+       return 0;
+
+err_rmdir:
+       debugfs_remove_recursive(kho_in.dir);
+       return err;
+}
+
 static __init int kho_init(void)
 {
        int err = 0;
+       const void *fdt = kho_get_fdt();
 
        if (!kho_enable)
                return 0;
@@ -524,6 +630,20 @@ static __init int kho_init(void)
        if (err)
                goto err_free_fdt;
 
+       if (fdt) {
+               err = kho_in_debugfs_init(fdt);
+               /*
+                * Failure to create /sys/kernel/debug/kho/in does not prevent
+                * reviving state from KHO and setting up KHO for the next
+                * kexec.
+                */
+               if (err)
+                       pr_err("failed exposing handover FDT in debugfs: %d\n",
+                              err);
+
+               return 0;
+       }
+
        for (int i = 0; i < kho_scratch_cnt; i++) {
                unsigned long base_pfn = PHYS_PFN(kho_scratch[i].addr);
                unsigned long count = kho_scratch[i].size >> PAGE_SHIFT;
@@ -551,7 +671,118 @@ err_free_scratch:
 }
 late_initcall(kho_init);
 
+static void __init kho_release_scratch(void)
+{
+       phys_addr_t start, end;
+       u64 i;
+
+       memmap_init_kho_scratch_pages();
+
+       /*
+        * Mark scratch mem as CMA before we return it. That way we
+        * ensure that no kernel allocations happen on it. That means
+        * we can reuse it as scratch memory again later.
+        */
+       __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,
+                            MEMBLOCK_KHO_SCRATCH, &start, &end, NULL) {
+               ulong start_pfn = pageblock_start_pfn(PFN_DOWN(start));
+               ulong end_pfn = pageblock_align(PFN_UP(end));
+               ulong pfn;
+
+               for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
+                       set_pageblock_migratetype(pfn_to_page(pfn),
+                                                 MIGRATE_CMA);
+       }
+}
+
 void __init kho_memory_init(void)
 {
-       kho_reserve_scratch();
+       if (kho_in.scratch_phys) {
+               kho_scratch = phys_to_virt(kho_in.scratch_phys);
+               kho_release_scratch();
+       } else {
+               kho_reserve_scratch();
+       }
+}
+
+void __init kho_populate(phys_addr_t fdt_phys, u64 fdt_len,
+                        phys_addr_t scratch_phys, u64 scratch_len)
+{
+       void *fdt = NULL;
+       struct kho_scratch *scratch = NULL;
+       int err = 0;
+       unsigned int scratch_cnt = scratch_len / sizeof(*kho_scratch);
+
+       /* Validate the input FDT */
+       fdt = early_memremap(fdt_phys, fdt_len);
+       if (!fdt) {
+               pr_warn("setup: failed to memremap FDT (0x%llx)\n", fdt_phys);
+               err = -EFAULT;
+               goto out;
+       }
+       err = fdt_check_header(fdt);
+       if (err) {
+               pr_warn("setup: handover FDT (0x%llx) is invalid: %d\n",
+                       fdt_phys, err);
+               err = -EINVAL;
+               goto out;
+       }
+       err = fdt_node_check_compatible(fdt, 0, KHO_FDT_COMPATIBLE);
+       if (err) {
+               pr_warn("setup: handover FDT (0x%llx) is incompatible with '%s': %d\n",
+                       fdt_phys, KHO_FDT_COMPATIBLE, err);
+               err = -EINVAL;
+               goto out;
+       }
+
+       scratch = early_memremap(scratch_phys, scratch_len);
+       if (!scratch) {
+               pr_warn("setup: failed to memremap scratch (phys=0x%llx, len=%lld)\n",
+                       scratch_phys, scratch_len);
+               err = -EFAULT;
+               goto out;
+       }
+
+       /*
+        * We pass safe contiguous blocks of memory to use for early boot
+        * purposes from the previous kernel so that we can resize the
+        * memblock array as needed.
+        */
+       for (int i = 0; i < scratch_cnt; i++) {
+               struct kho_scratch *area = &scratch[i];
+               u64 size = area->size;
+
+               memblock_add(area->addr, size);
+               err = memblock_mark_kho_scratch(area->addr, size);
+               if (WARN_ON(err)) {
+                       pr_warn("failed to mark the scratch region 0x%pa+0x%pa: %d",
+                               &area->addr, &size, err);
+                       goto out;
+               }
+               pr_debug("Marked 0x%pa+0x%pa as scratch", &area->addr, &size);
+       }
+
+       memblock_reserve(scratch_phys, scratch_len);
+
+       /*
+        * Now that we have a viable region of scratch memory, let's tell
+        * the memblocks allocator to only use that for any allocations.
+        * That way we ensure that nothing scribbles over in use data while
+        * we initialize the page tables which we will need to ingest all
+        * memory reservations from the previous kernel.
+        */
+       memblock_set_kho_scratch_only();
+
+       kho_in.fdt_phys = fdt_phys;
+       kho_in.scratch_phys = scratch_phys;
+       kho_scratch_cnt = scratch_cnt;
+       pr_info("found kexec handover data. Will skip init for some devices\n");
+
+out:
+       if (fdt)
+               early_memunmap(fdt, fdt_len);
+       if (scratch)
+               early_memunmap(scratch, scratch_len);
+       if (err)
+               pr_warn("disabling KHO revival: %d\n", err);
 }
index ec30d850e195aa73aa0f405b52abe8670f8dcabd..8895b95ffb5baa834d5e602ee4fa245bd6c3fdec 100644 (file)
@@ -2394,6 +2394,7 @@ void __init memblock_free_all(void)
        free_unused_memmap();
        reset_all_zones_managed_pages();
 
+       memblock_clear_kho_scratch_only();
        pages = free_low_memory_core_early();
        totalram_pages_add(pages);
 }