vfio/nvgrace-gpu: register device memory for poison handling

author Ankit Agrawal <ankita@nvidia.com>

Sun, 2 Nov 2025 18:44:34 +0000 (18:44 +0000)

committer Andrew Morton <akpm@linux-foundation.org>

Mon, 17 Nov 2025 01:28:30 +0000 (17:28 -0800)
author Ankit Agrawal <ankita@nvidia.com>
Sun, 2 Nov 2025 18:44:34 +0000 (18:44 +0000)
committer Andrew Morton <akpm@linux-foundation.org>
Mon, 17 Nov 2025 01:28:30 +0000 (17:28 -0800)
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c

index e346392b72f6ab9ccb916ce39df5124b9bd5fa73..3ce56d039cbec42215411bc6328fe177e78bd36b 100644 (file)
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -8,6 +8,10 @@
  #include <linux/delay.h>
  #include <linux/jiffies.h>
  
+#ifdef CONFIG_MEMORY_FAILURE
+#include <linux/memory-failure.h>
+#endif
+
  /*
   * The device memory usable to the workloads running in the VM is cached
   * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region)
@@ -47,6 +51,9 @@ struct mem_region {
                 void *memaddr;
                 void __iomem *ioaddr;
         };                      /* Base virtual address of the region */
+#ifdef CONFIG_MEMORY_FAILURE
+       struct pfn_address_space pfn_address_space;
+#endif
  };
  
  struct nvgrace_gpu_pci_core_device {
@@ -60,6 +67,28 @@ struct nvgrace_gpu_pci_core_device {
         bool has_mig_hw_bug;
  };
  
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Register @region's PFN span, based at vma->vm_pgoff, for poison handling. */
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region,
+                                       struct vm_area_struct *vma)
+{
+       struct pfn_address_space *space = &region->pfn_address_space;
+       unsigned long first_pfn = vma->vm_pgoff;
+       unsigned long npages = region->memlength >> PAGE_SHIFT;
+
+       /* Inclusive interval: [first_pfn, first_pfn + npages - 1]. */
+       space->node.start = first_pfn;
+       space->node.last = first_pfn + npages - 1;
+       space->mapping = vma->vm_file->f_mapping;
+
+       /* Hand the registration result back to the caller unchanged. */
+       return register_pfn_address_space(space);
+}
+
+#endif
+
  static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
  {
         struct nvgrace_gpu_pci_core_device *nvdev =
@@ -127,6 +156,13 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
  
         mutex_destroy(&nvdev->remap_lock);
  
+#ifdef CONFIG_MEMORY_FAILURE
+       if (nvdev->resmem.memlength)
+               unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
+
+       unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
+#endif
+
         vfio_pci_core_close_device(core_vdev);
  }
  
@@ -202,7 +238,14 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
  
         vma->vm_pgoff = start_pfn;
  
-       return 0;
+#ifdef CONFIG_MEMORY_FAILURE
+       if (nvdev->resmem.memlength && index == VFIO_PCI_BAR2_REGION_INDEX)
+               ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma);
+       else if (index == VFIO_PCI_BAR4_REGION_INDEX)
+               ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma);
+#endif
+
+       return ret;
  }
  
  static long
author	Ankit Agrawal <ankita@nvidia.com>
	Sun, 2 Nov 2025 18:44:34 +0000 (18:44 +0000)
committer	Andrew Morton <akpm@linux-foundation.org>
	Mon, 17 Nov 2025 01:28:30 +0000 (17:28 -0800)