int vfio_region_mmap(VFIORegion *region)
{
- int i, ret, prot = 0;
+ void *map_base, *map_align;
Error *local_err = NULL;
+ int i, ret, prot = 0;
+ off_t map_offset = 0;
+ size_t align;
char *name;
int fd;
prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
- for (i = 0; i < region->nr_mmaps; i++) {
- size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
- void *map_base, *map_align;
+ /*
+ * Align the mmap for more efficient mapping in the kernel. Ideally
+ * we'd know the PMD and PUD mapping sizes to use as discrete alignment
+ * intervals, but we don't. As of Linux v6.19, the largest PUD size
+ * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
+ * on x86_64).
+ *
+ * Align to the power-of-two of the entire region size, capped at 1GiB,
+ * and place the sparse subregions at their offsets within that window,
+ * so each subregion gets the best alignment its offset allows.
+ *
+ * NB. qemu_memalign() and friends actually allocate memory, whereas
+ * the region size here can exceed host memory, therefore we manually
+ * create an oversized anonymous mapping and clean it up for alignment.
+ */
- /*
- * Align the mmap for more efficient mapping in the kernel. Ideally
- * we'd know the PMD and PUD mapping sizes to use as discrete alignment
- * intervals, but we don't. As of Linux v6.12, the largest PUD size
- * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
- * on x86_64). Align by power-of-two size, capped at 1GiB.
- *
- * NB. qemu_memalign() and friends actually allocate memory, whereas
- * the region size here can exceed host memory, therefore we manually
- * create an oversized anonymous mapping and clean it up for alignment.
- */
- map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (map_base == MAP_FAILED) {
- ret = -errno;
- goto no_mmap;
- }
+ align = MIN(pow2ceil(region->size), 1 * GiB);
- fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
+ map_base = mmap(0, region->size + align, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (map_base == MAP_FAILED) {
+ ret = -errno;
+ trace_vfio_region_mmap_fault(memory_region_name(region->mem), -1,
+ region->fd_offset,
+ region->fd_offset + region->size - 1, ret);
+ return ret;
+ }
+
+ fd = vfio_device_get_region_fd(region->vbasedev, region->nr);
- map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
- munmap(map_base, map_align - map_base);
- munmap(map_align + region->mmaps[i].size,
- align - (map_align - map_base));
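+ /*
+ * Trim the unaligned head and the surplus tail of the reservation so
+ * that an aligned PROT_NONE window of exactly region->size remains at
+ * map_align.
+ */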
+ map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
+ munmap(map_base, map_align - map_base);
+ munmap(map_align + region->size,
+ align - (map_align - map_base));
- region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
+ /*
+ * The sparse mmap ranges should already have been sorted by
+ * vfio_setup_region_sparse_mmaps(). This is critical: the loop below
+ * relies on the range offsets being in ascending order.
+ */
+ for (i = 0; i < region->nr_mmaps; i++) {
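+ /* Drop the reserved pages in the gap before this range; the gap stays unmapped. */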
+ munmap(map_align + map_offset, region->mmaps[i].offset - map_offset);
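+ /*
+ * MAP_FIXED replaces the remaining PROT_NONE reservation so the device
+ * range lands at its natural offset within the aligned window.
+ */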
+ region->mmaps[i].mmap = mmap(map_align + region->mmaps[i].offset,
+ region->mmaps[i].size, prot,
MAP_SHARED | MAP_FIXED, fd,
region->fd_offset +
region->mmaps[i].offset);
if (region->mmaps[i].mmap == MAP_FAILED) {
ret = -errno;
+ /*
+ * Only unmap the remainder of the region here. Any mmaps that already
+ * succeeded are unmapped by the no_mmap error path.
+ */
+ munmap(map_align + region->mmaps[i].offset,
+ region->size - region->mmaps[i].offset);
goto no_mmap;
}
trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
region->mmaps[i].offset,
region->mmaps[i].offset +
region->mmaps[i].size - 1);
+
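+ /* Remember where this range ends, both for the next gap and for trimming the tail after the loop. */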
+ map_offset = region->mmaps[i].offset + region->mmaps[i].size;
+ }
+
+ /*
+ * Unmap the rest of the region not covered by sparse mmap.
+ */
+ if (map_offset < region->size) {
+ munmap(map_align + map_offset, region->size - map_offset);
}
if (!vfio_region_create_dma_buf(region, &local_err)) {