vfio/nvgrace-gpu: Add support for huge pfnmap

author Ankit Agrawal <ankita@nvidia.com>

Thu, 27 Nov 2025 17:06:28 +0000 (17:06 +0000)

committer Alex Williamson <alex@shazbot.org>

Fri, 28 Nov 2025 17:07:25 +0000 (10:07 -0700)
author Ankit Agrawal <ankita@nvidia.com>
Thu, 27 Nov 2025 17:06:28 +0000 (17:06 +0000)
committer Alex Williamson <alex@shazbot.org>
Fri, 28 Nov 2025 17:07:25 +0000 (10:07 -0700)
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c

index e33f24fbb0a4c2982f4c55caffd47599ede965f8..3034d6adf576a863d267789da32d82e3b5af613e 100644 (file)
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -131,6 +131,82 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
         vfio_pci_core_close_device(core_vdev);
  }
  
+/*
+ * Translate a user virtual address within @vma into a page offset relative
+ * to the start of the device memory region backing the mapping.
+ *
+ * The bits of vm_pgoff at and above (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT) are
+ * used as the region index, so they are masked off here.
+ */
+static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
+                                  unsigned long addr)
+{
+       u64 pgoff = vma->vm_pgoff &
+               ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+       return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
+}
+
+/*
+ * Fault handler that resolves a fault by inserting a device memory pfn at
+ * the given @order (a block of PAGE_SIZE << @order bytes; 0 is one page).
+ *
+ * Returns VM_FAULT_SIGBUS if the region index encoded in vm_pgoff has no
+ * memory region, VM_FAULT_FALLBACK if is_aligned_for_order() rejects the
+ * address/pfn pair for @order, and the result of vfio_pci_vmf_insert_pfn()
+ * otherwise.
+ */
+static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
+                                                 unsigned int order)
+{
+       struct vm_area_struct *vma = vmf->vma;
+       struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
+       struct vfio_pci_core_device *vdev = &nvdev->core_device;
+       unsigned int index =
+               vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+       vm_fault_t ret = VM_FAULT_FALLBACK;
+       struct mem_region *memregion;
+       unsigned long pfn, addr;
+
+       memregion = nvgrace_gpu_memregion(index, nvdev);
+       if (!memregion)
+               return VM_FAULT_SIGBUS;
+
+       /* Start of the order-sized block that contains the faulting address. */
+       addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+       pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
+
+       /* The insertion runs with memory_lock held for read. */
+       if (is_aligned_for_order(vma, addr, pfn, order)) {
+               scoped_guard(rwsem_read, &vdev->memory_lock)
+                       ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+       }
+
+       dev_dbg_ratelimited(&vdev->pdev->dev,
+                           "%s order = %u pfn 0x%lx: 0x%x\n",
+                           __func__, order, pfn,
+                           (unsigned int)ret);
+
+       return ret;
+}
+
+/* Base-page fault entry point: the order-0 case of the huge fault handler. */
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+       return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
+}
+
+/*
+ * VMA operations installed by nvgrace_gpu_mmap(). The huge_fault hook is only
+ * provided when the architecture supports huge pfnmap.
+ */
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+       .fault = nvgrace_gpu_vfio_pci_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+       .huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
+#endif
+};
+
  static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
                             struct vm_area_struct *vma)
  {
@@ -138,10 +191,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
                 container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
                              core_device.vdev);
         struct mem_region *memregion;
-       unsigned long start_pfn;
         u64 req_len, pgoff, end;
         unsigned int index;
-       int ret = 0;
  
         index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
  
@@ -158,17 +209,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
                 ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
  
         if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
-           check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
             check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
                 return -EOVERFLOW;
  
         /*
-        * Check that the mapping request does not go beyond available device
-        * memory size
+        * Check that the mapping request does not go beyond the exposed
+        * device memory size.
          */
         if (end > memregion->memlength)
                 return -EINVAL;
  
+       vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
         /*
          * The carved out region of the device memory needs the NORMAL_NC
          * property. Communicate as such to the hypervisor.
@@ -185,23 +237,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
                 vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
         }
  
-       /*
-        * Perform a PFN map to the memory and back the device BAR by the
-        * GPU memory.
-        *
-        * The available GPU memory size may not be power-of-2 aligned. The
-        * remainder is only backed by vfio_device_ops read/write handlers.
-        *
-        * During device reset, the GPU is safely disconnected to the CPU
-        * and access to the BAR will be immediately returned preventing
-        * machine check.
-        */
-       ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
-                             req_len, vma->vm_page_prot);
-       if (ret)
-               return ret;
-
-       vma->vm_pgoff = start_pfn;
+       vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
+       vma->vm_private_data = nvdev;
  
         return 0;
  }
author	Ankit Agrawal <ankita@nvidia.com>
	Thu, 27 Nov 2025 17:06:28 +0000 (17:06 +0000)
committer	Alex Williamson <alex@shazbot.org>
	Fri, 28 Nov 2025 17:07:25 +0000 (10:07 -0700)