2 * iommufd container backend
4 * Copyright (C) 2023 Intel Corporation.
5 * Copyright Red Hat, Inc. 2023
7 * Authors: Yi Liu <yi.l.liu@intel.com>
8 * Eric Auger <eric.auger@redhat.com>
10 * SPDX-License-Identifier: GPL-2.0-or-later
13 #include "qemu/osdep.h"
14 #include <sys/ioctl.h>
15 #include <linux/vfio.h>
16 #include <linux/iommufd.h>
18 #include "hw/vfio/vfio-common.h"
19 #include "qemu/error-report.h"
21 #include "qapi/error.h"
22 #include "sysemu/iommufd.h"
23 #include "hw/qdev-core.h"
24 #include "sysemu/reset.h"
25 #include "qemu/cutils.h"
26 #include "qemu/chardev_open.h"
/*
 * iommufd_cdev_map: DMA-map callback of the iommufd container backend.
 *
 * Recovers the enclosing VFIOIOMMUFDContainer from @bcontainer with
 * container_of() and forwards @iova, @size, @vaddr and @readonly to
 * iommufd_backend_map_dma() on container->be, returning that call's result.
 *
 * NOTE(review): the embedded line numbers jump (31, 34, 36, 38-39 are
 * absent), so the braces and whatever argument sat on original line 36
 * are not visible in this copy -- restore them from a pristine file.
 */
29 static int iommufd_cdev_map(const VFIOContainerBase
*bcontainer
, hwaddr iova
,
30 ram_addr_t size
, void *vaddr
, bool readonly
)
32 const VFIOIOMMUFDContainer
*container
=
33 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
35 return iommufd_backend_map_dma(container
->be
,
37 iova
, size
, vaddr
, readonly
);
/*
 * iommufd_cdev_unmap: DMA-unmap callback of the iommufd container backend.
 *
 * Recovers the enclosing VFIOIOMMUFDContainer from @bcontainer and calls
 * iommufd_backend_unmap_dma() with container->be, container->ioas_id,
 * @iova and @size, returning that call's result.
 *
 * NOTE(review): the parameter list ends on a dangling comma here (original
 * line 42 is absent), so at least one trailing parameter is not visible;
 * presumably it is the iotlb argument the TODO below refers to -- confirm.
 */
40 static int iommufd_cdev_unmap(const VFIOContainerBase
*bcontainer
,
41 hwaddr iova
, ram_addr_t size
,
44 const VFIOIOMMUFDContainer
*container
=
45 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
47 /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
48 return iommufd_backend_unmap_dma(container
->be
,
49 container
->ioas_id
, iova
, size
);
/*
 * iommufd_cdev_kvm_device_add: thin wrapper that passes the device fd
 * (vbasedev->fd) and @errp to vfio_kvm_device_add_fd() and returns its
 * result unchanged.
 */
52 static int iommufd_cdev_kvm_device_add(VFIODevice
*vbasedev
, Error
**errp
)
54 return vfio_kvm_device_add_fd(vbasedev
->fd
, errp
);
/*
 * iommufd_cdev_kvm_device_del: counterpart of the add helper.
 *
 * Calls vfio_kvm_device_del_fd() on vbasedev->fd; a non-zero result is
 * not propagated (the function returns void) but reported through
 * error_report_err().
 *
 * NOTE(review): the declaration of the local `err` (original line 59) is
 * not visible in this copy.
 */
57 static void iommufd_cdev_kvm_device_del(VFIODevice
*vbasedev
)
61 if (vfio_kvm_device_del_fd(vbasedev
->fd
, &err
)) {
62 error_report_err(err
);
/*
 * iommufd_cdev_connect_and_bind: connect to the iommufd backend and bind
 * the VFIO device fd to it.
 *
 * Visible sequence:
 *   1. iommufd_backend_connect() on vbasedev->iommufd;
 *   2. iommufd_cdev_kvm_device_add() for the device;
 *   3. VFIO_DEVICE_BIND_IOMMUFD ioctl on vbasedev->fd with
 *      bind.iommufd = iommufd->fd;
 *   4. store bind.out_devid in vbasedev->devid and emit a trace event.
 * The tail undoes the earlier steps in reverse order
 * (iommufd_cdev_kvm_device_del(), then iommufd_backend_disconnect()).
 *
 * NOTE(review): the `ret` declaration, the `if (ret)` guards, the label
 * definitions and the return statements fall on original lines that are
 * absent from this copy; only `goto err_kvm_device_add` is visible.
 */
66 static int iommufd_cdev_connect_and_bind(VFIODevice
*vbasedev
, Error
**errp
)
68 IOMMUFDBackend
*iommufd
= vbasedev
->iommufd
;
69 struct vfio_device_bind_iommufd bind
= {
70 .argsz
= sizeof(bind
),
75 ret
= iommufd_backend_connect(iommufd
, errp
);
81 * Add device to kvm-vfio to be prepared for the tracking
82 * in KVM. Especially for some emulated devices, it requires
83 * to have kvm information in the device open.
85 ret
= iommufd_cdev_kvm_device_add(vbasedev
, errp
);
87 goto err_kvm_device_add
;
90 /* Bind device to iommufd */
91 bind
.iommufd
= iommufd
->fd
;
92 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_BIND_IOMMUFD
, &bind
);
94 error_setg_errno(errp
, errno
, "error bind device fd=%d to iommufd=%d",
95 vbasedev
->fd
, bind
.iommufd
);
/* The kernel-assigned device id is kept for later lookups by devid. */
99 vbasedev
->devid
= bind
.out_devid
;
100 trace_iommufd_cdev_connect_and_bind(bind
.iommufd
, vbasedev
->name
,
101 vbasedev
->fd
, vbasedev
->devid
);
/* Unwind path: drop the kvm-vfio registration, then the backend link. */
104 iommufd_cdev_kvm_device_del(vbasedev
);
106 iommufd_backend_disconnect(iommufd
);
/*
 * iommufd_cdev_unbind_and_disconnect: tear down what
 * iommufd_cdev_connect_and_bind() set up.
 *
 * Removes the device from kvm-vfio and disconnects from the iommufd
 * backend. No explicit unbind ioctl is issued; per the comment below the
 * kernel unbinds when the device fd is closed.
 */
110 static void iommufd_cdev_unbind_and_disconnect(VFIODevice
*vbasedev
)
112 /* Unbind is automatically conducted when device fd is closed */
113 iommufd_cdev_kvm_device_del(vbasedev
);
114 iommufd_backend_disconnect(vbasedev
->iommufd
);
/*
 * iommufd_cdev_getfd: open the VFIO character device that belongs to the
 * device at @sysfs_path.
 *
 * Visible steps:
 *   - build "<sysfs_path>/vfio-dev" and scan it for the first entry whose
 *     name starts with "vfio";
 *   - read "<that entry>/dev" and parse it as "major:minor";
 *   - build the dev_t with makedev() and open
 *     "/dev/vfio/devices/<entry name>" through open_cdev().
 * Errors are reported via @errp and then prefixed with VFIO_MSG_PREFIX
 * and the sysfs path.
 *
 * NOTE(review): `ret` is declared `long int` while the function returns
 * `int`; the value is narrowed on return.
 * NOTE(review): `dent->d_name` is used after the scan loop, which relies
 * on the loop stopping at the match (the statement on original line 139
 * is not visible -- presumably a break) and on the directory stream not
 * having been advanced or closed yet.
 * NOTE(review): when sscanf() fails the code jumps to out_free_dev_path;
 * this copy does not show whether `contents` is released on that path --
 * verify against the pristine file to rule out a leak.
 * NOTE(review): the opendir() call, the remaining local declarations, the
 * cleanup labels and the return statement are not visible here.
 */
117 static int iommufd_cdev_getfd(const char *sysfs_path
, Error
**errp
)
120 long int ret
= -ENOTTY
;
121 char *path
, *vfio_dev_path
= NULL
, *vfio_path
= NULL
;
129 path
= g_strdup_printf("%s/vfio-dev", sysfs_path
);
132 error_setg_errno(errp
, errno
, "couldn't open directory %s", path
);
/* Look for the vfioX entry that names the character device. */
136 while ((dent
= readdir(dir
))) {
137 if (!strncmp(dent
->d_name
, "vfio", 4)) {
138 vfio_dev_path
= g_strdup_printf("%s/%s/dev", path
, dent
->d_name
);
143 if (!vfio_dev_path
) {
144 error_setg(errp
, "failed to find vfio-dev/vfioX/dev");
148 if (!g_file_get_contents(vfio_dev_path
, &contents
, &length
, NULL
)) {
149 error_setg(errp
, "failed to load \"%s\"", vfio_dev_path
);
150 goto out_free_dev_path
;
153 if (sscanf(contents
, "%d:%d", &major
, &minor
) != 2) {
154 error_setg(errp
, "failed to get major:minor for \"%s\"", vfio_dev_path
);
155 goto out_free_dev_path
;
158 vfio_devt
= makedev(major
, minor
);
/* The expected dev_t is handed to open_cdev() together with the path. */
160 vfio_path
= g_strdup_printf("/dev/vfio/devices/%s", dent
->d_name
);
161 ret
= open_cdev(vfio_path
, vfio_devt
);
163 error_setg(errp
, "Failed to open %s", vfio_path
);
166 trace_iommufd_cdev_getfd(vfio_path
, ret
);
170 g_free(vfio_dev_path
);
175 error_prepend(errp
, VFIO_MSG_PREFIX
, path
);
/*
 * iommufd_cdev_attach_ioas_hwpt: attach the device to the iommufd object
 * identified by @id (an IOAS or hwpt, per the comment below).
 *
 * Issues VFIO_DEVICE_ATTACH_IOMMUFD_PT on vbasedev->fd with `attach_data`;
 * on failure an errno-based error naming the iommufd fd, the device name,
 * the device fd and @id is set in errp. A trace event follows.
 *
 * NOTE(review): the tail of the signature, the remaining fields of the
 * `attach_data` initializer, the failure check, the rest of the trace
 * call and the return are on original lines absent from this copy.
 */
182 static int iommufd_cdev_attach_ioas_hwpt(VFIODevice
*vbasedev
, uint32_t id
,
185 int ret
, iommufd
= vbasedev
->iommufd
->fd
;
186 struct vfio_device_attach_iommufd_pt attach_data
= {
187 .argsz
= sizeof(attach_data
),
192 /* Attach device to an IOAS or hwpt within iommufd */
193 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_ATTACH_IOMMUFD_PT
, &attach_data
);
195 error_setg_errno(errp
, errno
,
196 "[iommufd=%d] error attach %s (%d) to id=%d",
197 iommufd
, vbasedev
->name
, vbasedev
->fd
, id
);
199 trace_iommufd_cdev_attach_ioas_hwpt(iommufd
, vbasedev
->name
,
/*
 * iommufd_cdev_detach_ioas_hwpt: detach the device from whatever iommufd
 * object it is currently attached to.
 *
 * Issues VFIO_DEVICE_DETACH_IOMMUFD_PT on vbasedev->fd; on failure sets
 * an errno-based "detach <name> failed" error in @errp. A trace event
 * with the iommufd fd and device name follows.
 *
 * NOTE(review): the failure check around error_setg_errno() and the
 * return statement are not visible in this copy.
 */
205 static int iommufd_cdev_detach_ioas_hwpt(VFIODevice
*vbasedev
, Error
**errp
)
207 int ret
, iommufd
= vbasedev
->iommufd
->fd
;
208 struct vfio_device_detach_iommufd_pt detach_data
= {
209 .argsz
= sizeof(detach_data
),
213 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_DETACH_IOMMUFD_PT
, &detach_data
);
215 error_setg_errno(errp
, errno
, "detach %s failed", vbasedev
->name
);
217 trace_iommufd_cdev_detach_ioas_hwpt(iommufd
, vbasedev
->name
);
/*
 * iommufd_cdev_attach_container: attach the device to @container by
 * delegating to iommufd_cdev_attach_ioas_hwpt() with the container's
 * ioas_id, returning that call's result.
 *
 * NOTE(review): the last parameter line (original line 224) is absent;
 * the body uses `errp`, so presumably it declared `Error **errp`.
 */
222 static int iommufd_cdev_attach_container(VFIODevice
*vbasedev
,
223 VFIOIOMMUFDContainer
*container
,
226 return iommufd_cdev_attach_ioas_hwpt(vbasedev
, container
->ioas_id
, errp
);
/*
 * iommufd_cdev_detach_container: best-effort detach of the device.
 *
 * Calls iommufd_cdev_detach_ioas_hwpt(); a failure is reported with
 * error_report_err() and not propagated (the function returns void).
 * @container is not referenced by the visible statements.
 *
 * NOTE(review): the declaration of the local `err` is not visible here.
 */
229 static void iommufd_cdev_detach_container(VFIODevice
*vbasedev
,
230 VFIOIOMMUFDContainer
*container
)
234 if (iommufd_cdev_detach_ioas_hwpt(vbasedev
, &err
)) {
235 error_report_err(err
);
/*
 * iommufd_cdev_container_destroy: release a container.
 *
 * Visible teardown order: unregister the memory listener, call
 * vfio_container_destroy() on the embedded base container, then free the
 * IOAS id through iommufd_backend_free_id().
 *
 * NOTE(review): the body of the non-empty device_list check (original
 * line 244) is absent; presumably it returns early so a container that
 * still has devices is kept -- confirm. Any final release of @container
 * itself is likewise not visible.
 */
239 static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer
*container
)
241 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
243 if (!QLIST_EMPTY(&bcontainer
->device_list
)) {
246 memory_listener_unregister(&bcontainer
->listener
);
247 vfio_container_destroy(bcontainer
);
248 iommufd_backend_free_id(container
->be
, container
->ioas_id
);
/*
 * iommufd_cdev_ram_block_discard_disable: forward @state to
 * ram_block_uncoordinated_discard_disable() and return its result.
 * Only uncoordinated discards are affected; per the comment fragment
 * below, coordinated discarding via the RamDiscardManager is supported.
 */
252 static int iommufd_cdev_ram_block_discard_disable(bool state
)
255 * We support coordinated discarding of RAM via the RamDiscardManager.
257 return ram_block_uncoordinated_discard_disable(state
);
/*
 * iommufd_cdev_get_info_iova_range: query the usable IOVA ranges of
 * @ioas_id and record them in the base container.
 *
 * Uses the IOMMU_IOAS_IOVA_RANGES ioctl twice: a first call with a bare,
 * zeroed header (EMSGSIZE is tolerated there), then -- after growing the
 * buffer by num_iovas entries and pointing allowed_iovas at the memory
 * right behind the header -- a second call to fetch the ranges.
 * Each returned range is inserted into bcontainer->iova_ranges and
 * bcontainer->pgsizes is set from out_iova_alignment.
 * On failure @errp receives an errno-based "Cannot get IOVA ranges".
 *
 * NOTE(review): `sz` is an int holding num_iovas * sizeof(range); the
 * product is computed in an unsigned type and then narrowed.
 * NOTE(review): error_setg_errno() reads errno on the error path; the
 * statements between the failing ioctl and that call are not visible, so
 * check that nothing in between (e.g. a free) can clobber errno.
 * NOTE(review): the goto/label lines, the release of `info` and the
 * return statements are absent from this copy.
 */
260 static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer
*container
,
261 uint32_t ioas_id
, Error
**errp
)
263 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
264 struct iommu_ioas_iova_ranges
*info
;
265 struct iommu_iova_range
*iova_ranges
;
266 int ret
, sz
, fd
= container
->be
->fd
;
268 info
= g_malloc0(sizeof(*info
));
269 info
->size
= sizeof(*info
);
270 info
->ioas_id
= ioas_id
;
/* First pass: header only; EMSGSIZE is not treated as a failure. */
272 ret
= ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
);
273 if (ret
&& errno
!= EMSGSIZE
) {
/* Second pass: room for num_iovas entries directly after the header. */
277 sz
= info
->num_iovas
* sizeof(struct iommu_iova_range
);
278 info
= g_realloc(info
, sizeof(*info
) + sz
);
279 info
->allowed_iovas
= (uintptr_t)(info
+ 1);
281 ret
= ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
);
286 iova_ranges
= (struct iommu_iova_range
*)(uintptr_t)info
->allowed_iovas
;
288 for (int i
= 0; i
< info
->num_iovas
; i
++) {
289 Range
*range
= g_new(Range
, 1);
291 range_set_bounds(range
, iova_ranges
[i
].start
, iova_ranges
[i
].last
);
292 bcontainer
->iova_ranges
=
293 range_list_insert(bcontainer
->iova_ranges
, range
);
295 bcontainer
->pgsizes
= info
->out_iova_alignment
;
303 error_setg_errno(errp
, errno
, "Cannot get IOVA ranges");
/*
 * iommufd_cdev_attach: attach_device callback -- bring a VFIO cdev device
 * into an iommufd-backed container of address space @as.
 *
 * Visible flow:
 *   1. If vbasedev->fd is negative, obtain the device fd from
 *      iommufd_cdev_getfd(vbasedev->sysfsdev) and store it; the existing
 *      fd is used otherwise.
 *   2. iommufd_cdev_connect_and_bind().
 *   3. Walk the containers of the address space; for one that uses the
 *      iommufd class ops and the same backend, try to attach. A failed
 *      attempt is only traced; a successful one disables uncoordinated
 *      RAM discard and jumps to found_container.
 *   4. Otherwise allocate a new IOAS and a new container, link it into
 *      the address space, attach, disable uncoordinated RAM discard,
 *      query IOVA ranges (falling back with a warning and host page size
 *      on failure), and register the memory listener.
 *   5. VFIO_DEVICE_GET_INFO, vfio_cpr_register_container(), then fill in
 *      the VFIODevice fields and link the device into the container's
 *      and the global device lists.
 * The labels at the end unwind these steps in reverse order.
 *
 * NOTE(review): many original lines are absent from this copy (the
 * declarations of ret, devfd, ioas_id and err; most `if (ret)` guards;
 * the found_container, err_discard_disable, err_alloc_ioas and
 * err_connect_bind label definitions; the return statements). The
 * summary above is limited to what the remaining fragments show.
 * NOTE(review): @name is not referenced by any visible statement.
 */
307 static int iommufd_cdev_attach(const char *name
, VFIODevice
*vbasedev
,
308 AddressSpace
*as
, Error
**errp
)
310 VFIOContainerBase
*bcontainer
;
311 VFIOIOMMUFDContainer
*container
;
312 VFIOAddressSpace
*space
;
313 struct vfio_device_info dev_info
= { .argsz
= sizeof(dev_info
) };
317 const VFIOIOMMUClass
*iommufd_vioc
=
318 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD
));
320 if (vbasedev
->fd
< 0) {
321 devfd
= iommufd_cdev_getfd(vbasedev
->sysfsdev
, errp
);
325 vbasedev
->fd
= devfd
;
327 devfd
= vbasedev
->fd
;
330 ret
= iommufd_cdev_connect_and_bind(vbasedev
, errp
);
332 goto err_connect_bind
;
335 space
= vfio_get_address_space(as
);
337 /* try to attach to an existing container in this space */
338 QLIST_FOREACH(bcontainer
, &space
->containers
, next
) {
339 container
= container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
340 if (bcontainer
->ops
!= iommufd_vioc
||
341 vbasedev
->iommufd
!= container
->be
) {
344 if (iommufd_cdev_attach_container(vbasedev
, container
, &err
)) {
345 const char *msg
= error_get_pretty(err
);
347 trace_iommufd_cdev_fail_attach_existing_container(msg
);
351 ret
= iommufd_cdev_ram_block_discard_disable(true);
354 "Cannot set discarding of RAM broken (%d)", ret
);
355 goto err_discard_disable
;
357 goto found_container
;
361 /* Need to allocate a new dedicated container */
362 ret
= iommufd_backend_alloc_ioas(vbasedev
->iommufd
, &ioas_id
, errp
);
367 trace_iommufd_cdev_alloc_ioas(vbasedev
->iommufd
->fd
, ioas_id
);
369 container
= g_malloc0(sizeof(*container
));
370 container
->be
= vbasedev
->iommufd
;
371 container
->ioas_id
= ioas_id
;
373 bcontainer
= &container
->bcontainer
;
374 vfio_container_init(bcontainer
, space
, iommufd_vioc
);
375 QLIST_INSERT_HEAD(&space
->containers
, bcontainer
, next
);
377 ret
= iommufd_cdev_attach_container(vbasedev
, container
, errp
);
379 goto err_attach_container
;
382 ret
= iommufd_cdev_ram_block_discard_disable(true);
384 goto err_discard_disable
;
/* A failed range query is downgraded to a warning (local err, not errp). */
387 ret
= iommufd_cdev_get_info_iova_range(container
, ioas_id
, &err
);
389 error_append_hint(&err
,
390 "Fallback to default 64bit IOVA range and 4K page size\n");
391 warn_report_err(err
);
393 bcontainer
->pgsizes
= qemu_real_host_page_size();
396 bcontainer
->listener
= vfio_memory_listener
;
397 memory_listener_register(&bcontainer
->listener
, bcontainer
->space
->as
);
/* Listener registration reports failure through bcontainer->error. */
399 if (bcontainer
->error
) {
401 error_propagate_prepend(errp
, bcontainer
->error
,
402 "memory listener initialization failed: ");
403 goto err_listener_register
;
406 bcontainer
->initialized
= true;
409 ret
= ioctl(devfd
, VFIO_DEVICE_GET_INFO
, &dev_info
);
411 error_setg_errno(errp
, errno
, "error getting device info");
412 goto err_listener_register
;
415 ret
= vfio_cpr_register_container(bcontainer
, errp
);
417 goto err_listener_register
;
421 * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
422 * for discarding incompatibility check as well?
424 if (vbasedev
->ram_block_discard_allowed
) {
425 iommufd_cdev_ram_block_discard_disable(false);
429 vbasedev
->num_irqs
= dev_info
.num_irqs
;
430 vbasedev
->num_regions
= dev_info
.num_regions
;
431 vbasedev
->flags
= dev_info
.flags
;
432 vbasedev
->reset_works
= !!(dev_info
.flags
& VFIO_DEVICE_FLAGS_RESET
);
433 vbasedev
->bcontainer
= bcontainer
;
434 QLIST_INSERT_HEAD(&bcontainer
->device_list
, vbasedev
, container_next
);
435 QLIST_INSERT_HEAD(&vfio_device_list
, vbasedev
, global_next
);
437 trace_iommufd_cdev_device_info(vbasedev
->name
, devfd
, vbasedev
->num_irqs
,
438 vbasedev
->num_regions
, vbasedev
->flags
);
/* Error unwinding: each label undoes one setup step, newest first. */
441 err_listener_register
:
442 iommufd_cdev_ram_block_discard_disable(false);
444 iommufd_cdev_detach_container(vbasedev
, container
);
445 err_attach_container
:
446 iommufd_cdev_container_destroy(container
);
448 vfio_put_address_space(space
);
449 iommufd_cdev_unbind_and_disconnect(vbasedev
);
/*
 * iommufd_cdev_detach: detach_device callback, the inverse of
 * iommufd_cdev_attach().
 *
 * Unlinks the device from the global and per-container lists, clears
 * vbasedev->bcontainer, re-enables uncoordinated RAM discard when the
 * device had not allowed it, unregisters the container from CPR,
 * detaches from and destroys the container, drops the address-space
 * reference, and finally unbinds/disconnects from iommufd.
 *
 * `bcontainer` and `space` are captured up front because
 * vbasedev->bcontainer is reset to NULL before they are used.
 *
 * NOTE(review): the last argument of the container_of() initializer
 * (original line 461) is absent from this copy.
 */
455 static void iommufd_cdev_detach(VFIODevice
*vbasedev
)
457 VFIOContainerBase
*bcontainer
= vbasedev
->bcontainer
;
458 VFIOAddressSpace
*space
= bcontainer
->space
;
459 VFIOIOMMUFDContainer
*container
= container_of(bcontainer
,
460 VFIOIOMMUFDContainer
,
462 QLIST_REMOVE(vbasedev
, global_next
);
463 QLIST_REMOVE(vbasedev
, container_next
);
464 vbasedev
->bcontainer
= NULL
;
466 if (!vbasedev
->ram_block_discard_allowed
) {
467 iommufd_cdev_ram_block_discard_disable(false);
470 vfio_cpr_unregister_container(bcontainer
);
471 iommufd_cdev_detach_container(vbasedev
, container
);
472 iommufd_cdev_container_destroy(container
);
473 vfio_put_address_space(space
);
475 iommufd_cdev_unbind_and_disconnect(vbasedev
);
/*
 * iommufd_cdev_pci_find_by_devid: look up a device by its iommufd devid.
 *
 * Walks the global vfio_device_list, considering only devices whose
 * container uses the iommufd class ops, and returns the first one whose
 * devid equals @devid.
 *
 * NOTE(review): the body of the ops check (presumably `continue`) and the
 * not-found return (presumably NULL) are on lines absent from this copy.
 * NOTE(review): vbasedev_iter->bcontainer is dereferenced without a NULL
 * check; this assumes every listed device has a container -- confirm.
 */
479 static VFIODevice
*iommufd_cdev_pci_find_by_devid(__u32 devid
)
481 VFIODevice
*vbasedev_iter
;
482 const VFIOIOMMUClass
*iommufd_vioc
=
483 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD
));
485 QLIST_FOREACH(vbasedev_iter
, &vfio_device_list
, global_next
) {
486 if (vbasedev_iter
->bcontainer
->ops
!= iommufd_vioc
) {
489 if (devid
== vbasedev_iter
->devid
) {
490 return vbasedev_iter
;
/*
 * iommufd_cdev_dep_get_realized_vpdev: map a hot-reset dependent device
 * entry to the realized VFIOPCIDevice behind it.
 *
 * Two filters are visible: entries whose devid equals @reset_dev's devid
 * or VFIO_PCI_DEVID_OWNED, and entries for which no device is found, the
 * device is not realized, or it is not of type VFIO_DEVICE_TYPE_PCI.
 * Otherwise the enclosing VFIOPCIDevice is returned via container_of().
 *
 * NOTE(review): the bodies of both filters are absent from this copy;
 * presumably each returns NULL -- confirm.
 */
496 static VFIOPCIDevice
*
497 iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device
*dep_dev
,
498 VFIODevice
*reset_dev
)
500 VFIODevice
*vbasedev_tmp
;
502 if (dep_dev
->devid
== reset_dev
->devid
||
503 dep_dev
->devid
== VFIO_PCI_DEVID_OWNED
) {
507 vbasedev_tmp
= iommufd_cdev_pci_find_by_devid(dep_dev
->devid
);
508 if (!vbasedev_tmp
|| !vbasedev_tmp
->dev
->realized
||
509 vbasedev_tmp
->type
!= VFIO_DEVICE_TYPE_PCI
) {
513 return container_of(vbasedev_tmp
, VFIOPCIDevice
, vbasedev
);
/*
 * iommufd_cdev_pci_hot_reset: pci_hot_reset callback for the iommufd
 * backend.
 *
 * Visible flow:
 *   1. Trace, run vfio_pci_pre_reset() on the device and clear its
 *      needs_reset flag.
 *   2. Fetch the hot-reset info and assert that it carries device ids
 *      (VFIO_PCI_HOT_RESET_FLAG_DEV_ID).
 *   3. If the DEV_ID_OWNED flag is not set and the device has no PM
 *      reset, report each dependent device that is not owned.
 *   4. For every dependent device: trace it, assert it is owned, resolve
 *      it to a realized VFIOPCIDevice, and pre-reset it.
 *   5. Issue VFIO_DEVICE_PCI_HOT_RESET with a header-only argument (no fd
 *      array, see the comment below) and trace the result.
 *   6. Run vfio_pci_post_reset() on the resolved dependent devices and on
 *      the device itself.
 *
 * NOTE(review): the declarations of i, ret, tmp and multi, every skip or
 * exit path (continue/goto/return), the labels and the release of `info`
 * and `reset` are on original lines absent from this copy, so how @single
 * and `multi` steer the flow cannot be stated beyond the one visible
 * `!single && !multi` test.
 */
516 static int iommufd_cdev_pci_hot_reset(VFIODevice
*vbasedev
, bool single
)
518 VFIOPCIDevice
*vdev
= container_of(vbasedev
, VFIOPCIDevice
, vbasedev
);
519 struct vfio_pci_hot_reset_info
*info
= NULL
;
520 struct vfio_pci_dependent_device
*devices
;
521 struct vfio_pci_hot_reset
*reset
;
525 trace_vfio_pci_hot_reset(vdev
->vbasedev
.name
, single
? "one" : "multi");
528 vfio_pci_pre_reset(vdev
);
530 vdev
->vbasedev
.needs_reset
= false;
532 ret
= vfio_pci_get_pci_hot_reset_info(vdev
, &info
);
538 assert(info
->flags
& VFIO_PCI_HOT_RESET_FLAG_DEV_ID
);
540 devices
= &info
->devices
[0];
542 if (!(info
->flags
& VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
)) {
543 if (!vdev
->has_pm_reset
) {
544 for (i
= 0; i
< info
->count
; i
++) {
545 if (devices
[i
].devid
== VFIO_PCI_DEVID_NOT_OWNED
) {
546 error_report("vfio: Cannot reset device %s, "
547 "depends on device %04x:%02x:%02x.%x "
548 "which is not owned.",
549 vdev
->vbasedev
.name
, devices
[i
].segment
,
550 devices
[i
].bus
, PCI_SLOT(devices
[i
].devfn
),
551 PCI_FUNC(devices
[i
].devfn
));
559 trace_vfio_pci_hot_reset_has_dep_devices(vdev
->vbasedev
.name
);
561 for (i
= 0; i
< info
->count
; i
++) {
564 trace_iommufd_cdev_pci_hot_reset_dep_devices(devices
[i
].segment
,
566 PCI_SLOT(devices
[i
].devfn
),
567 PCI_FUNC(devices
[i
].devfn
),
571 * If a VFIO cdev device is resettable, all the dependent devices
572 * are either bound to same iommufd or within same iommu_groups as
573 * one of the iommufd bound devices.
575 assert(devices
[i
].devid
!= VFIO_PCI_DEVID_NOT_OWNED
);
577 tmp
= iommufd_cdev_dep_get_realized_vpdev(&devices
[i
], &vdev
->vbasedev
);
586 vfio_pci_pre_reset(tmp
);
587 tmp
->vbasedev
.needs_reset
= false;
591 if (!single
&& !multi
) {
596 /* Use zero length array for hot reset with iommufd backend */
597 reset
= g_malloc0(sizeof(*reset
));
598 reset
->argsz
= sizeof(*reset
);
601 ret
= ioctl(vdev
->vbasedev
.fd
, VFIO_DEVICE_PCI_HOT_RESET
, reset
);
607 trace_vfio_pci_hot_reset_result(vdev
->vbasedev
.name
,
608 ret
? strerror(errno
) : "Success");
610 /* Re-enable INTx on affected devices */
611 for (i
= 0; i
< info
->count
; i
++) {
614 tmp
= iommufd_cdev_dep_get_realized_vpdev(&devices
[i
], &vdev
->vbasedev
);
618 vfio_pci_post_reset(tmp
);
622 vfio_pci_post_reset(vdev
);
/*
 * vfio_iommu_iommufd_class_init: class initializer that installs the
 * iommufd implementations of the VFIOIOMMUClass callbacks (dma_map,
 * dma_unmap, attach_device, detach_device, pci_hot_reset).
 * @data is not referenced by the visible statements.
 */
629 static void vfio_iommu_iommufd_class_init(ObjectClass
*klass
, void *data
)
631 VFIOIOMMUClass
*vioc
= VFIO_IOMMU_CLASS(klass
);
633 vioc
->dma_map
= iommufd_cdev_map
;
634 vioc
->dma_unmap
= iommufd_cdev_unmap
;
635 vioc
->attach_device
= iommufd_cdev_attach
;
636 vioc
->detach_device
= iommufd_cdev_detach
;
637 vioc
->pci_hot_reset
= iommufd_cdev_pci_hot_reset
;
640 static const TypeInfo types
[] = {
642 .name
= TYPE_VFIO_IOMMU_IOMMUFD
,
643 .parent
= TYPE_VFIO_IOMMU
,
644 .class_init
= vfio_iommu_iommufd_class_init
,