x86/hyperv: Add kexec/kdump support on Azure CVMs
author Vitaly Kuznetsov <vkuznets@redhat.com>
Thu, 28 Aug 2025 09:16:18 +0000 (12:16 +0300)
committer Wei Liu <wei.liu@kernel.org>
Tue, 30 Sep 2025 22:49:24 +0000 (22:49 +0000)
Azure CVM instance types featuring a paravisor hang upon kdump. The
investigation shows that makedumpfile causes a hang when it steps on a page
which was previously shared with the host
(HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY). The new kernel has no
knowledge of these 'special' regions (which are Vmbus connection pages,
GPADL buffers, ...). There are several ways to approach the issue:
- Convey the knowledge about these regions to the new kernel somehow.
- Unshare these regions before accessing them in the new kernel (it is unclear
if there's a way to query the status for a given GPA range).
- Unshare these regions before jumping to the new kernel (which this patch
implements).

To make the procedure as robust as possible, store PFN ranges of shared
regions in a linked list instead of storing GVAs and re-using
hv_vtom_set_host_visibility(). This also avoids memory allocation
on the kdump/kexec path.
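
For illustration only (this block is not part of the commit): a minimal userspace
sketch of the region-tracking idea described above. The names here (struct region,
region_add, region_remove) are made up for the example; the actual patch below keeps
the ranges in struct hv_enc_pfn_region nodes on a kernel list protected by a raw
spinlock. The sketch only shows how adjacent PFNs grow a range and how removing a
PFN from the middle splits one.

/* Standalone illustration of coalescing PFN ranges; hypothetical names. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct region {
	struct region *next;
	uint64_t pfn;   /* first PFN in the range */
	int count;      /* number of consecutive PFNs */
};

static struct region *head;

static void region_add(uint64_t pfn)
{
	struct region *r;

	for (r = head; r; r = r->next) {
		if (pfn >= r->pfn && pfn < r->pfn + r->count)
			return;                 /* already tracked */
		if (r->pfn + r->count == pfn) { /* grow region up */
			r->count++;
			return;
		}
		if (pfn + 1 == r->pfn) {        /* grow region down */
			r->pfn--;
			r->count++;
			return;
		}
	}
	r = calloc(1, sizeof(*r));              /* no adjacent region: new node */
	if (!r)
		exit(1);
	r->pfn = pfn;
	r->count = 1;
	r->next = head;
	head = r;
}

static void region_remove(uint64_t pfn)
{
	struct region *r, *n;

	for (r = head; r; r = r->next) {
		if (pfn < r->pfn || pfn >= r->pfn + r->count)
			continue;
		if (pfn == r->pfn + r->count - 1) {     /* trim tail */
			r->count--;
		} else if (pfn == r->pfn) {             /* trim head */
			r->pfn++;
			r->count--;
		} else {                                /* split in two */
			n = calloc(1, sizeof(*n));
			if (!n)
				exit(1);
			n->pfn = pfn + 1;
			n->count = r->count - (int)(pfn - r->pfn + 1);
			r->count = (int)(pfn - r->pfn);
			n->next = head;
			head = n;
		}
		return; /* emptied nodes are simply left with count == 0 here */
	}
}

int main(void)
{
	/* Sharing PFNs 5, 6, 7 coalesces into a single region {5, 3}... */
	region_add(5); region_add(6); region_add(7);
	/* ...and unsharing 6 splits it into {5, 1} and {7, 1}. */
	region_remove(6);
	for (struct region *r = head; r; r = r->next)
		if (r->count)
			printf("region: pfn=%llu count=%d\n",
			       (unsigned long long)r->pfn, r->count);
	return 0;
}

Running it prints the two regions {5, 1} and {7, 1}, mirroring the grow-up/grow-down
and split cases handled by hv_list_enc_add() and hv_list_enc_remove() in the diff.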

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Tianyu Lan <tiala@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
arch/x86/hyperv/ivm.c

index ade6c665c97e9cd9a68cab0287dbbe8b6c417fd2..a4615b889f3e6c84801d2cf4217d65f0866ab04b 100644
@@ -462,6 +462,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
                hv_ghcb_msr_read(msr, value);
 }
 
+/*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()).
+ */
+struct hv_enc_pfn_region {
+       struct list_head list;
+       u64 pfn;
+       int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+       struct hv_enc_pfn_region *ent;
+       unsigned long flags;
+       u64 pfn;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               pfn = pfn_list[i];
+
+               raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+               /* Check if the PFN already exists in some region first */
+               list_for_each_entry(ent, &hv_list_enc, list) {
+                       if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+                               /* Nothing to do - pfn is already in the list */
+                               goto unlock_done;
+               }
+
+               /*
+                * Check if the PFN is adjacent to an existing region. Growing
+                * a region can make it adjacent to another one but merging is
+                * not (yet) implemented for simplicity. A PFN cannot be added
+                * to two regions to keep the logic in hv_list_enc_remove()
+                * correct.
+                */
+               list_for_each_entry(ent, &hv_list_enc, list) {
+                       if (ent->pfn + ent->count == pfn) {
+                               /* Grow existing region up */
+                               ent->count++;
+                               goto unlock_done;
+                       } else if (pfn + 1 == ent->pfn) {
+                               /* Grow existing region down */
+                               ent->pfn--;
+                               ent->count++;
+                               goto unlock_done;
+                       }
+               }
+               raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+               /* No adjacent region found -- create a new one */
+               ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+               if (!ent)
+                       return -ENOMEM;
+
+               ent->pfn = pfn;
+               ent->count = 1;
+
+               raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+               list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+               raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+       }
+
+       return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+       struct hv_enc_pfn_region *ent, *t;
+       struct hv_enc_pfn_region new_region;
+       unsigned long flags;
+       u64 pfn;
+       int i;
+
+       for (i = 0; i < count; i++) {
+               pfn = pfn_list[i];
+
+               raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+               list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+                       if (pfn == ent->pfn + ent->count - 1) {
+                               /* Removing tail pfn */
+                               ent->count--;
+                               if (!ent->count) {
+                                       list_del(&ent->list);
+                                       kfree(ent);
+                               }
+                               goto unlock_done;
+                       } else if (pfn == ent->pfn) {
+                               /* Removing head pfn */
+                               ent->count--;
+                               ent->pfn++;
+                               if (!ent->count) {
+                                       list_del(&ent->list);
+                                       kfree(ent);
+                               }
+                               goto unlock_done;
+                       } else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+                               /*
+                                * Removing a pfn in the middle. Cut off the tail
+                                * of the existing region and create a template for
+                                * the new one.
+                                */
+                               new_region.pfn = pfn + 1;
+                               new_region.count = ent->count - (pfn - ent->pfn + 1);
+                               ent->count = pfn - ent->pfn;
+                               goto unlock_split;
+                       }
+
+               }
+unlock_done:
+               raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+               continue;
+
+unlock_split:
+               raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+               ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+               if (!ent)
+                       return -ENOMEM;
+
+               ent->pfn = new_region.pfn;
+               ent->count = new_region.count;
+
+               raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+               list_add(&ent->list, &hv_list_enc);
+               raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+       }
+
+       return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+       if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+               return;
+
+       /*
+        * Crash kernel reaches here with interrupts disabled: can't wait for
+        * conversions to finish.
+        *
+        * If race happened, just report and proceed.
+        */
+       if (!set_memory_enc_stop_conversion())
+               pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+static void hv_vtom_kexec_finish(void)
+{
+       struct hv_gpa_range_for_visibility *input;
+       struct hv_enc_pfn_region *ent;
+       unsigned long flags;
+       u64 hv_status;
+       int cur, i;
+
+       local_irq_save(flags);
+       input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+       if (unlikely(!input))
+               goto out;
+
+       list_for_each_entry(ent, &hv_list_enc, list) {
+               for (i = 0, cur = 0; i < ent->count; i++) {
+                       input->gpa_page_list[cur] = ent->pfn + i;
+                       cur++;
+
+                       if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+                               input->partition_id = HV_PARTITION_ID_SELF;
+                               input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+                               input->reserved0 = 0;
+                               input->reserved1 = 0;
+                               hv_status = hv_do_rep_hypercall(
+                                       HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+                                       cur, 0, input, NULL);
+                               WARN_ON_ONCE(!hv_result_success(hv_status));
+                               cur = 0;
+                       }
+               }
+
+       }
+
+out:
+       local_irq_restore(flags);
+}
+
 /*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
  *
@@ -475,6 +664,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
        struct hv_gpa_range_for_visibility *input;
        u64 hv_status;
        unsigned long flags;
+       int ret;
 
        /* no-op if partition isolation is not enabled */
        if (!hv_is_isolation_supported())
@@ -486,6 +676,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
                return -EINVAL;
        }
 
+       if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+               ret = hv_list_enc_remove(pfn, count);
+       else
+               ret = hv_list_enc_add(pfn, count);
+       if (ret)
+               return ret;
+
        local_irq_save(flags);
        input = *this_cpu_ptr(hyperv_pcpu_input_arg);
 
@@ -506,8 +703,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 
        if (hv_result_success(hv_status))
                return 0;
+
+       if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+               ret = hv_list_enc_add(pfn, count);
        else
-               return -EFAULT;
+               ret = hv_list_enc_remove(pfn, count);
+       /*
+        * There's no good way to recover from -ENOMEM here, the accounting is
+        * wrong either way.
+        */
+       WARN_ON_ONCE(ret);
+
+       return -EFAULT;
 }
 
 /*
@@ -669,6 +876,8 @@ void __init hv_vtom_init(void)
        x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
        x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
        x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+       x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+       x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;
 
        /* Set WB as the default cache mode. */
        guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);