From: Ankit Agrawal
Date: Tue, 17 Feb 2026 15:30:10 +0000 (+0000)
Subject: hw/vfio: align mmap to power-of-2 of region size for hugepfnmap
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=3863e47828d5bda1776fb7588a2187c7fba1d0c2;p=thirdparty%2Fqemu.git

hw/vfio: align mmap to power-of-2 of region size for hugepfnmap

On Grace-based systems such as GB200, device memory is exposed as a BAR,
but the actual mappable size is not power-of-2 aligned. The previous
algorithm aligned each sparse mmap area based on its individual size
using ctz64(), which prevented efficient huge page usage by the kernel.

Adjust the VFIO region mapping alignment to use the next power-of-2 of
the total region size and place the sparse subregions at their
appropriate offsets. This provides better opportunities for huge
alignment, allowing the kernel to use larger page sizes for the VMA.
In particular, it enables PMD-level huge pages, which can significantly
improve memory access performance and reduce TLB pressure for large
device memory regions.

With this change:
- Create a single aligned base mapping for the entire region
- Base the alignment on pow2ceil(region->size), capped at 1GiB
- Unmap the gaps between sparse regions
- Use MAP_FIXED to overlay sparse mmap areas at their offsets

Example VMA for device memory of size 0x2F00F00000 on GB200:

Before (misaligned, no hugepfnmap):
ff88ff000000-ffb7fff00000 rw-s 400000000000 00:06 727  /dev/vfio/devices/vfio1

After (aligned to 1GiB boundary, hugepfnmap enabled):
ff8ac0000000-ffb9c0f00000 rw-s 400000000000 00:06 727  /dev/vfio/devices/vfio1

This requires the sparse regions to be sorted by offset (done in the
previous patch) so that gaps can be correctly identified and handled.
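To illustrate the trick in isolation, here is a minimal standalone sketch
(illustrative only, not the QEMU implementation; the ALIGN_UP helper and
the hardcoded sizes are made up, and a 64-bit host is assumed):

    /*
     * Sketch of the oversized-map alignment trick: reserve size + align
     * bytes of PROT_NONE anonymous memory, then trim the unaligned head
     * and the excess tail so an aligned, exactly-sized reservation
     * remains. PROT_NONE anonymous mappings only consume virtual address
     * space, so the reservation may far exceed host memory.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>

    /* Made-up stand-in for QEMU's ROUND_UP(); align must be a power of 2. */
    #define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

    int main(void)
    {
        size_t size  = 0x2F00F00000ULL; /* region size from the example */
        size_t align = 1ULL << 30;      /* pow2ceil(size), capped at 1GiB */
        uint8_t *base, *aligned;

        base = mmap(NULL, size + align, PROT_NONE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (base == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        aligned = (uint8_t *)ALIGN_UP((uintptr_t)base, align);
        /* Unaligned head (a zero-length munmap fails harmlessly). */
        munmap(base, aligned - base);
        /* Excess tail beyond the aligned reservation. */
        munmap(aligned + size, align - (aligned - base));

        printf("aligned base: %p\n", (void *)aligned);

        /* The real code would now overlay MAP_FIXED mappings here. */
        munmap(aligned, size);
        return 0;
    }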
Cc: Alex Williamson
Reviewed-by: Alex Williamson
Reviewed-by: Shameer Kolothum
Suggested-by: Jason Gunthorpe
Signed-off-by: Ankit Agrawal
Reviewed-by: Cédric Le Goater
Link: https://lore.kernel.org/qemu-devel/20260217153010.408739-4-ankita@nvidia.com
Signed-off-by: Cédric Le Goater
---

diff --git a/hw/vfio/region.c b/hw/vfio/region.c
index d464eadf9c..47fdc2df34 100644
--- a/hw/vfio/region.c
+++ b/hw/vfio/region.c
@@ -344,8 +344,11 @@ static bool vfio_region_create_dma_buf(VFIORegion *region, Error **errp)
 
 int vfio_region_mmap(VFIORegion *region)
 {
-    int i, ret, prot = 0;
+    void *map_base, *map_align;
     Error *local_err = NULL;
+    int i, ret, prot = 0;
+    off_t map_offset = 0;
+    size_t align;
     char *name;
     int fd;
 
@@ -356,41 +359,61 @@ int vfio_region_mmap(VFIORegion *region)
     prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
     prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
 
-    for (i = 0; i < region->nr_mmaps; i++) {
-        size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
-        void *map_base, *map_align;
+    /*
+     * Align the mmap for more efficient mapping in the kernel. Ideally
+     * we'd know the PMD and PUD mapping sizes to use as discrete alignment
+     * intervals, but we don't. As of Linux v6.19, the largest PUD size
+     * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
+     * on x86_64).
+     *
+     * Align by power-of-two of the size of the entire region - capped
+     * by 1G - and place the sparse subregions at their appropriate offset.
+     * This will get maximum alignment.
+     *
+     * NB. qemu_memalign() and friends actually allocate memory, whereas
+     * the region size here can exceed host memory, therefore we manually
+     * create an oversized anonymous mapping and clean it up for alignment.
+     */
 
-        /*
-         * Align the mmap for more efficient mapping in the kernel. Ideally
-         * we'd know the PMD and PUD mapping sizes to use as discrete alignment
-         * intervals, but we don't. As of Linux v6.12, the largest PUD size
-         * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
-         * on x86_64). Align by power-of-two size, capped at 1GiB.
-         *
-         * NB. qemu_memalign() and friends actually allocate memory, whereas
-         * the region size here can exceed host memory, therefore we manually
-         * create an oversized anonymous mapping and clean it up for alignment.
-         */
-        map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
-                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-        if (map_base == MAP_FAILED) {
-            ret = -errno;
-            goto no_mmap;
-        }
+    align = MIN(pow2ceil(region->size), 1 * GiB);
 
-        fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
+    map_base = mmap(0, region->size + align, PROT_NONE,
+                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (map_base == MAP_FAILED) {
+        ret = -errno;
+        trace_vfio_region_mmap_fault(memory_region_name(region->mem), -1,
+                                     region->fd_offset,
+                                     region->fd_offset + region->size - 1, ret);
+        return ret;
+    }
+
+    fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
 
-        map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
-        munmap(map_base, map_align - map_base);
-        munmap(map_align + region->mmaps[i].size,
-               align - (map_align - map_base));
+    map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
+    munmap(map_base, map_align - map_base);
+    munmap(map_align + region->size,
+           align - (map_align - map_base));
 
-        region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
+    /*
+     * Regions should already be sorted by vfio_setup_region_sparse_mmaps().
+     * This is critical for the following algorithm which relies on range
+     * offsets being in ascending order.
+     */
+    for (i = 0; i < region->nr_mmaps; i++) {
+        munmap(map_align + map_offset, region->mmaps[i].offset - map_offset);
+        region->mmaps[i].mmap = mmap(map_align + region->mmaps[i].offset,
+                                     region->mmaps[i].size, prot,
                                      MAP_SHARED | MAP_FIXED, fd,
                                      region->fd_offset +
                                      region->mmaps[i].offset);
         if (region->mmaps[i].mmap == MAP_FAILED) {
             ret = -errno;
+            /*
+             * Only unmap the rest of the region. Any mmaps that were
+             * successful will be unmapped in no_mmap.
+             */
+            munmap(map_align + region->mmaps[i].offset,
+                   region->size - region->mmaps[i].offset);
             goto no_mmap;
         }
 
@@ -408,6 +431,15 @@ int vfio_region_mmap(VFIORegion *region)
                                                region->mmaps[i].offset,
                                                region->mmaps[i].offset +
                                                region->mmaps[i].size - 1);
+
+        map_offset = region->mmaps[i].offset + region->mmaps[i].size;
+    }
+
+    /*
+     * Unmap the rest of the region not covered by sparse mmap.
+     */
+    if (map_offset < region->size) {
+        munmap(map_align + map_offset, region->size - map_offset);
     }
 
     if (!vfio_region_create_dma_buf(region, &local_err)) {
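Editor's illustration of the overlay step from the hunks above, in
isolation. This is a hypothetical sketch: struct sparse_area and
overlay_sparse() are invented names rather than QEMU's types, and the
areas array is assumed to be sorted by ascending offset, as the previous
patch guarantees for VFIO sparse mmaps:

    #include <stddef.h>
    #include <sys/mman.h>
    #include <sys/types.h>

    struct sparse_area {    /* invented stand-in for VFIO's mmap ranges */
        off_t offset;       /* offset of the area within the region */
        size_t size;        /* size of the mappable area */
        void *mmap;         /* resulting mapping, as in VFIORegion */
    };

    /*
     * Overlay sorted sparse areas onto an aligned PROT_NONE reservation
     * of region_size bytes at base: punch out the gap before each area,
     * MAP_FIXED the area itself from the region fd, then drop any
     * uncovered tail.
     */
    static int overlay_sparse(void *base, size_t region_size, off_t fd_offset,
                              int fd, int prot,
                              struct sparse_area *areas, int nr)
    {
        off_t done = 0;     /* end of the last area placed so far */

        for (int i = 0; i < nr; i++) {
            /* Gap before this area (zero-length munmap fails harmlessly). */
            munmap((char *)base + done, areas[i].offset - done);

            areas[i].mmap = mmap((char *)base + areas[i].offset,
                                 areas[i].size, prot,
                                 MAP_SHARED | MAP_FIXED, fd,
                                 fd_offset + areas[i].offset);
            if (areas[i].mmap == MAP_FAILED) {
                /* Drop only the remainder; caller unwinds earlier areas. */
                munmap((char *)base + areas[i].offset,
                       region_size - areas[i].offset);
                return -1;
            }
            done = areas[i].offset + areas[i].size;
        }

        /* Tail of the region not covered by any sparse area. */
        if ((size_t)done < region_size) {
            munmap((char *)base + done, region_size - done);
        }
        return 0;
    }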