1 /*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37 #include <linux/sched/signal.h>
38
39 #define DRIVER_VERSION "0.3"
40 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
41 #define DRIVER_DESC "VFIO - User Level meta-driver"
42
43 static struct vfio {
44 struct class *class;
45 struct list_head iommu_drivers_list;
46 struct mutex iommu_drivers_lock;
47 struct list_head group_list;
48 struct idr group_idr;
49 struct mutex group_lock;
50 struct cdev group_cdev;
51 dev_t group_devt;
52 wait_queue_head_t release_q;
53 } vfio;
54
55 struct vfio_iommu_driver {
56 const struct vfio_iommu_driver_ops *ops;
57 struct list_head vfio_next;
58 };
59
60 struct vfio_container {
61 struct kref kref;
62 struct list_head group_list;
63 struct rw_semaphore group_lock;
64 struct vfio_iommu_driver *iommu_driver;
65 void *iommu_data;
66 bool noiommu;
67 };
68
69 struct vfio_unbound_dev {
70 struct device *dev;
71 struct list_head unbound_next;
72 };
73
74 struct vfio_group {
75 struct kref kref;
76 int minor;
77 atomic_t container_users;
78 struct iommu_group *iommu_group;
79 struct vfio_container *container;
80 struct list_head device_list;
81 struct mutex device_lock;
82 struct device *dev;
83 struct notifier_block nb;
84 struct list_head vfio_next;
85 struct list_head container_next;
86 struct list_head unbound_list;
87 struct mutex unbound_lock;
88 atomic_t opened;
89 wait_queue_head_t container_q;
90 bool noiommu;
91 struct kvm *kvm;
92 struct blocking_notifier_head notifier;
93 };
94
95 struct vfio_device {
96 struct kref kref;
97 struct device *dev;
98 const struct vfio_device_ops *ops;
99 struct vfio_group *group;
100 struct list_head group_next;
101 void *device_data;
102 };
103
104 #ifdef CONFIG_VFIO_NOIOMMU
105 static bool noiommu __read_mostly;
106 module_param_named(enable_unsafe_noiommu_mode,
107 noiommu, bool, S_IRUGO | S_IWUSR);
108 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
109 #endif
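
/*
 * Usage note: the parameter above is writable (S_IRUGO | S_IWUSR), so the
 * mode can be enabled at load time (e.g. "modprobe vfio
 * enable_unsafe_noiommu_mode=1" when vfio is modular) or afterwards via
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode.  Groups created in
 * this mode appear as /dev/vfio/noiommu-$GROUP and require CAP_SYS_RAWIO.
 */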
110
111 /*
112 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
113  * and remove functions; any use cases other than acquiring the first
114 * reference for the purpose of calling vfio_add_group_dev() or removing
115 * that symmetric reference after vfio_del_group_dev() should use the raw
116 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
117 * removes the device from the dummy group and cannot be nested.
118 */
119 struct iommu_group *vfio_iommu_group_get(struct device *dev)
120 {
121 struct iommu_group *group;
122 int __maybe_unused ret;
123
124 group = iommu_group_get(dev);
125
126 #ifdef CONFIG_VFIO_NOIOMMU
127 /*
128 * With noiommu enabled, an IOMMU group will be created for a device
129  * that doesn't already have one and doesn't have an iommu_ops on its
130 * bus. We set iommudata simply to be able to identify these groups
131 * as special use and for reclamation later.
132 */
133 if (group || !noiommu || iommu_present(dev->bus))
134 return group;
135
136 group = iommu_group_alloc();
137 if (IS_ERR(group))
138 return NULL;
139
140 iommu_group_set_name(group, "vfio-noiommu");
141 iommu_group_set_iommudata(group, &noiommu, NULL);
142 ret = iommu_group_add_device(group, dev);
143 if (ret) {
144 iommu_group_put(group);
145 return NULL;
146 }
147
148 /*
149 * Where to taint? At this point we've added an IOMMU group for a
150 * device that is not backed by iommu_ops, therefore any iommu_
151 * callback using iommu_ops can legitimately Oops. So, while we may
152 * be about to give a DMA capable device to a user without IOMMU
153 * protection, which is clearly taint-worthy, let's go ahead and do
154 * it here.
155 */
156 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
157 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
158 #endif
159
160 return group;
161 }
162 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
163
164 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
165 {
166 #ifdef CONFIG_VFIO_NOIOMMU
167 if (iommu_group_get_iommudata(group) == &noiommu)
168 iommu_group_remove_device(dev);
169 #endif
170
171 iommu_group_put(group);
172 }
173 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
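
/*
 * A minimal sketch of the symmetric pairing described above, as a VFIO bus
 * driver's probe/remove path might use it.  The my_* names, ops table and
 * private data type are illustrative assumptions, not part of this file:
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct iommu_group *group;
 *		struct my_device *vdev;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(&pdev->dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev) {
 *			vfio_iommu_group_put(group, &pdev->dev);
 *			return -ENOMEM;
 *		}
 *
 *		ret = vfio_add_group_dev(&pdev->dev, &my_vfio_ops, vdev);
 *		if (ret) {
 *			kfree(vdev);
 *			vfio_iommu_group_put(group, &pdev->dev);
 *		}
 *		return ret;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_device *vdev = vfio_del_group_dev(&pdev->dev);
 *
 *		vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
 *		kfree(vdev);
 *	}
 */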
174
175 #ifdef CONFIG_VFIO_NOIOMMU
176 static void *vfio_noiommu_open(unsigned long arg)
177 {
178 if (arg != VFIO_NOIOMMU_IOMMU)
179 return ERR_PTR(-EINVAL);
180 if (!capable(CAP_SYS_RAWIO))
181 return ERR_PTR(-EPERM);
182
183 return NULL;
184 }
185
186 static void vfio_noiommu_release(void *iommu_data)
187 {
188 }
189
190 static long vfio_noiommu_ioctl(void *iommu_data,
191 unsigned int cmd, unsigned long arg)
192 {
193 if (cmd == VFIO_CHECK_EXTENSION)
194 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
195
196 return -ENOTTY;
197 }
198
199 static int vfio_noiommu_attach_group(void *iommu_data,
200 struct iommu_group *iommu_group)
201 {
202 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
203 }
204
205 static void vfio_noiommu_detach_group(void *iommu_data,
206 struct iommu_group *iommu_group)
207 {
208 }
209
210 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
211 .name = "vfio-noiommu",
212 .owner = THIS_MODULE,
213 .open = vfio_noiommu_open,
214 .release = vfio_noiommu_release,
215 .ioctl = vfio_noiommu_ioctl,
216 .attach_group = vfio_noiommu_attach_group,
217 .detach_group = vfio_noiommu_detach_group,
218 };
219 #endif
220
221
222 /**
223 * IOMMU driver registration
224 */
225 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
226 {
227 struct vfio_iommu_driver *driver, *tmp;
228
229 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
230 if (!driver)
231 return -ENOMEM;
232
233 driver->ops = ops;
234
235 mutex_lock(&vfio.iommu_drivers_lock);
236
237 /* Check for duplicates */
238 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
239 if (tmp->ops == ops) {
240 mutex_unlock(&vfio.iommu_drivers_lock);
241 kfree(driver);
242 return -EINVAL;
243 }
244 }
245
246 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
247
248 mutex_unlock(&vfio.iommu_drivers_lock);
249
250 return 0;
251 }
252 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
253
254 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
255 {
256 struct vfio_iommu_driver *driver;
257
258 mutex_lock(&vfio.iommu_drivers_lock);
259 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
260 if (driver->ops == ops) {
261 list_del(&driver->vfio_next);
262 mutex_unlock(&vfio.iommu_drivers_lock);
263 kfree(driver);
264 return;
265 }
266 }
267 mutex_unlock(&vfio.iommu_drivers_lock);
268 }
269 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
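
/*
 * A minimal registration sketch for an IOMMU backend, mirroring the
 * vfio_noiommu_ops table above.  The my_iommu_* callbacks are assumed to be
 * implemented elsewhere and are illustrative only:
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_driver_ops = {
 *		.name		= "vfio-my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */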
270
271 /**
272 * Group minor allocation/free - both called with vfio.group_lock held
273 */
274 static int vfio_alloc_group_minor(struct vfio_group *group)
275 {
276 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
277 }
278
279 static void vfio_free_group_minor(int minor)
280 {
281 idr_remove(&vfio.group_idr, minor);
282 }
283
284 static int vfio_iommu_group_notifier(struct notifier_block *nb,
285 unsigned long action, void *data);
286 static void vfio_group_get(struct vfio_group *group);
287
288 /**
289 * Container objects - containers are created when /dev/vfio/vfio is
290 * opened, but their lifecycle extends until the last user is done, so
291  * they're freed via kref. Must support container/group/device being
292 * closed in any order.
293 */
294 static void vfio_container_get(struct vfio_container *container)
295 {
296 kref_get(&container->kref);
297 }
298
299 static void vfio_container_release(struct kref *kref)
300 {
301 struct vfio_container *container;
302 container = container_of(kref, struct vfio_container, kref);
303
304 kfree(container);
305 }
306
307 static void vfio_container_put(struct vfio_container *container)
308 {
309 kref_put(&container->kref, vfio_container_release);
310 }
311
312 static void vfio_group_unlock_and_free(struct vfio_group *group)
313 {
314 mutex_unlock(&vfio.group_lock);
315 /*
316 * Unregister outside of lock. A spurious callback is harmless now
317 * that the group is no longer in vfio.group_list.
318 */
319 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
320 kfree(group);
321 }
322
323 /**
324 * Group objects - create, release, get, put, search
325 */
326 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
327 {
328 struct vfio_group *group, *tmp;
329 struct device *dev;
330 int ret, minor;
331
332 group = kzalloc(sizeof(*group), GFP_KERNEL);
333 if (!group)
334 return ERR_PTR(-ENOMEM);
335
336 kref_init(&group->kref);
337 INIT_LIST_HEAD(&group->device_list);
338 mutex_init(&group->device_lock);
339 INIT_LIST_HEAD(&group->unbound_list);
340 mutex_init(&group->unbound_lock);
341 atomic_set(&group->container_users, 0);
342 atomic_set(&group->opened, 0);
343 init_waitqueue_head(&group->container_q);
344 group->iommu_group = iommu_group;
345 #ifdef CONFIG_VFIO_NOIOMMU
346 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
347 #endif
348 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
349
350 group->nb.notifier_call = vfio_iommu_group_notifier;
351
352 /*
353 * blocking notifiers acquire a rwsem around registering and hold
354  * it around the callback. Therefore, we need to register outside of
355 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
356 * do anything unless it can find the group in vfio.group_list, so
357 * no harm in registering early.
358 */
359 ret = iommu_group_register_notifier(iommu_group, &group->nb);
360 if (ret) {
361 kfree(group);
362 return ERR_PTR(ret);
363 }
364
365 mutex_lock(&vfio.group_lock);
366
367 /* Did we race creating this group? */
368 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
369 if (tmp->iommu_group == iommu_group) {
370 vfio_group_get(tmp);
371 vfio_group_unlock_and_free(group);
372 return tmp;
373 }
374 }
375
376 minor = vfio_alloc_group_minor(group);
377 if (minor < 0) {
378 vfio_group_unlock_and_free(group);
379 return ERR_PTR(minor);
380 }
381
382 dev = device_create(vfio.class, NULL,
383 MKDEV(MAJOR(vfio.group_devt), minor),
384 group, "%s%d", group->noiommu ? "noiommu-" : "",
385 iommu_group_id(iommu_group));
386 if (IS_ERR(dev)) {
387 vfio_free_group_minor(minor);
388 vfio_group_unlock_and_free(group);
389 return ERR_CAST(dev);
390 }
391
392 group->minor = minor;
393 group->dev = dev;
394
395 list_add(&group->vfio_next, &vfio.group_list);
396
397 mutex_unlock(&vfio.group_lock);
398
399 return group;
400 }
401
402 /* called with vfio.group_lock held */
403 static void vfio_group_release(struct kref *kref)
404 {
405 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
406 struct vfio_unbound_dev *unbound, *tmp;
407 struct iommu_group *iommu_group = group->iommu_group;
408
409 WARN_ON(!list_empty(&group->device_list));
410 WARN_ON(group->notifier.head);
411
412 list_for_each_entry_safe(unbound, tmp,
413 &group->unbound_list, unbound_next) {
414 list_del(&unbound->unbound_next);
415 kfree(unbound);
416 }
417
418 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
419 list_del(&group->vfio_next);
420 vfio_free_group_minor(group->minor);
421 vfio_group_unlock_and_free(group);
422 iommu_group_put(iommu_group);
423 }
424
425 static void vfio_group_put(struct vfio_group *group)
426 {
427 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
428 }
429
430 struct vfio_group_put_work {
431 struct work_struct work;
432 struct vfio_group *group;
433 };
434
435 static void vfio_group_put_bg(struct work_struct *work)
436 {
437 struct vfio_group_put_work *do_work;
438
439 do_work = container_of(work, struct vfio_group_put_work, work);
440
441 vfio_group_put(do_work->group);
442 kfree(do_work);
443 }
444
445 static void vfio_group_schedule_put(struct vfio_group *group)
446 {
447 struct vfio_group_put_work *do_work;
448
449 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
450 if (WARN_ON(!do_work))
451 return;
452
453 INIT_WORK(&do_work->work, vfio_group_put_bg);
454 do_work->group = group;
455 schedule_work(&do_work->work);
456 }
457
458 /* Assume group_lock or group reference is held */
459 static void vfio_group_get(struct vfio_group *group)
460 {
461 kref_get(&group->kref);
462 }
463
464 /*
465 * Not really a try as we will sleep for mutex, but we need to make
466 * sure the group pointer is valid under lock and get a reference.
467 */
468 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
469 {
470 struct vfio_group *target = group;
471
472 mutex_lock(&vfio.group_lock);
473 list_for_each_entry(group, &vfio.group_list, vfio_next) {
474 if (group == target) {
475 vfio_group_get(group);
476 mutex_unlock(&vfio.group_lock);
477 return group;
478 }
479 }
480 mutex_unlock(&vfio.group_lock);
481
482 return NULL;
483 }
484
485 static
486 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
487 {
488 struct vfio_group *group;
489
490 mutex_lock(&vfio.group_lock);
491 list_for_each_entry(group, &vfio.group_list, vfio_next) {
492 if (group->iommu_group == iommu_group) {
493 vfio_group_get(group);
494 mutex_unlock(&vfio.group_lock);
495 return group;
496 }
497 }
498 mutex_unlock(&vfio.group_lock);
499
500 return NULL;
501 }
502
503 static struct vfio_group *vfio_group_get_from_minor(int minor)
504 {
505 struct vfio_group *group;
506
507 mutex_lock(&vfio.group_lock);
508 group = idr_find(&vfio.group_idr, minor);
509 if (!group) {
510 mutex_unlock(&vfio.group_lock);
511 return NULL;
512 }
513 vfio_group_get(group);
514 mutex_unlock(&vfio.group_lock);
515
516 return group;
517 }
518
519 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
520 {
521 struct iommu_group *iommu_group;
522 struct vfio_group *group;
523
524 iommu_group = iommu_group_get(dev);
525 if (!iommu_group)
526 return NULL;
527
528 group = vfio_group_get_from_iommu(iommu_group);
529 iommu_group_put(iommu_group);
530
531 return group;
532 }
533
534 /**
535 * Device objects - create, release, get, put, search
536 */
537 static
538 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
539 struct device *dev,
540 const struct vfio_device_ops *ops,
541 void *device_data)
542 {
543 struct vfio_device *device;
544
545 device = kzalloc(sizeof(*device), GFP_KERNEL);
546 if (!device)
547 return ERR_PTR(-ENOMEM);
548
549 kref_init(&device->kref);
550 device->dev = dev;
551 device->group = group;
552 device->ops = ops;
553 device->device_data = device_data;
554 dev_set_drvdata(dev, device);
555
556 /* No need to get group_lock, caller has group reference */
557 vfio_group_get(group);
558
559 mutex_lock(&group->device_lock);
560 list_add(&device->group_next, &group->device_list);
561 mutex_unlock(&group->device_lock);
562
563 return device;
564 }
565
566 static void vfio_device_release(struct kref *kref)
567 {
568 struct vfio_device *device = container_of(kref,
569 struct vfio_device, kref);
570 struct vfio_group *group = device->group;
571
572 list_del(&device->group_next);
573 mutex_unlock(&group->device_lock);
574
575 dev_set_drvdata(device->dev, NULL);
576
577 kfree(device);
578
579 /* vfio_del_group_dev may be waiting for this device */
580 wake_up(&vfio.release_q);
581 }
582
583 /* Device reference always implies a group reference */
584 void vfio_device_put(struct vfio_device *device)
585 {
586 struct vfio_group *group = device->group;
587 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
588 vfio_group_put(group);
589 }
590 EXPORT_SYMBOL_GPL(vfio_device_put);
591
592 static void vfio_device_get(struct vfio_device *device)
593 {
594 vfio_group_get(device->group);
595 kref_get(&device->kref);
596 }
597
598 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
599 struct device *dev)
600 {
601 struct vfio_device *device;
602
603 mutex_lock(&group->device_lock);
604 list_for_each_entry(device, &group->device_list, group_next) {
605 if (device->dev == dev) {
606 vfio_device_get(device);
607 mutex_unlock(&group->device_lock);
608 return device;
609 }
610 }
611 mutex_unlock(&group->device_lock);
612 return NULL;
613 }
614
615 /*
616 * Some drivers, like pci-stub, are only used to prevent other drivers from
617 * claiming a device and are therefore perfectly legitimate for a user owned
618 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
619 * of the device, but it does prevent the user from having direct access to
620 * the device, which is useful in some circumstances.
621 *
622 * We also assume that we can include PCI interconnect devices, ie. bridges.
623 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
624 * then all of the downstream devices will be part of the same IOMMU group as
625 * the bridge. Thus, if placing the bridge into the user owned IOVA space
626 * breaks anything, it only does so for user owned devices downstream. Note
627 * that error notification via MSI can be affected for platforms that handle
628 * MSI within the same IOVA space as DMA.
629 */
630 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
631
632 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
633 {
634 int i;
635
636 if (dev_is_pci(dev)) {
637 struct pci_dev *pdev = to_pci_dev(dev);
638
639 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
640 return true;
641 }
642
643 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
644 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
645 return true;
646 }
647
648 return false;
649 }
650
651 /*
652 * A vfio group is viable for use by userspace if all devices are in
653 * one of the following states:
654 * - driver-less
655 * - bound to a vfio driver
656 * - bound to a whitelisted driver
657 * - a PCI interconnect device
658 *
659 * We use two methods to determine whether a device is bound to a vfio
660 * driver. The first is to test whether the device exists in the vfio
661 * group. The second is to test if the device exists on the group
662 * unbound_list, indicating it's in the middle of transitioning from
663 * a vfio driver to driver-less.
664 */
665 static int vfio_dev_viable(struct device *dev, void *data)
666 {
667 struct vfio_group *group = data;
668 struct vfio_device *device;
669 struct device_driver *drv = ACCESS_ONCE(dev->driver);
670 struct vfio_unbound_dev *unbound;
671 int ret = -EINVAL;
672
673 mutex_lock(&group->unbound_lock);
674 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
675 if (dev == unbound->dev) {
676 ret = 0;
677 break;
678 }
679 }
680 mutex_unlock(&group->unbound_lock);
681
682 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
683 return 0;
684
685 device = vfio_group_get_device(group, dev);
686 if (device) {
687 vfio_device_put(device);
688 return 0;
689 }
690
691 return ret;
692 }
693
694 /**
695 * Async device support
696 */
697 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
698 {
699 struct vfio_device *device;
700
701 /* Do we already know about it? We shouldn't */
702 device = vfio_group_get_device(group, dev);
703 if (WARN_ON_ONCE(device)) {
704 vfio_device_put(device);
705 return 0;
706 }
707
708 /* Nothing to do for idle groups */
709 if (!atomic_read(&group->container_users))
710 return 0;
711
712 /* TODO Prevent device auto probing */
713 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
714 iommu_group_id(group->iommu_group));
715
716 return 0;
717 }
718
719 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
720 {
721 /* We don't care what happens when the group isn't in use */
722 if (!atomic_read(&group->container_users))
723 return 0;
724
725 return vfio_dev_viable(dev, group);
726 }
727
728 static int vfio_iommu_group_notifier(struct notifier_block *nb,
729 unsigned long action, void *data)
730 {
731 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
732 struct device *dev = data;
733 struct vfio_unbound_dev *unbound;
734
735 /*
736 * Need to go through a group_lock lookup to get a reference or we
737 * risk racing a group being removed. Ignore spurious notifies.
738 */
739 group = vfio_group_try_get(group);
740 if (!group)
741 return NOTIFY_OK;
742
743 switch (action) {
744 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
745 vfio_group_nb_add_dev(group, dev);
746 break;
747 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
748 /*
749 * Nothing to do here. If the device is in use, then the
750 * vfio sub-driver should block the remove callback until
751 * it is unused. If the device is unused or attached to a
752 * stub driver, then it should be released and we don't
753 * care that it will be going away.
754 */
755 break;
756 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
757 pr_debug("%s: Device %s, group %d binding to driver\n",
758 __func__, dev_name(dev),
759 iommu_group_id(group->iommu_group));
760 break;
761 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
762 pr_debug("%s: Device %s, group %d bound to driver %s\n",
763 __func__, dev_name(dev),
764 iommu_group_id(group->iommu_group), dev->driver->name);
765 BUG_ON(vfio_group_nb_verify(group, dev));
766 break;
767 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
768 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
769 __func__, dev_name(dev),
770 iommu_group_id(group->iommu_group), dev->driver->name);
771 break;
772 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
773 pr_debug("%s: Device %s, group %d unbound from driver\n",
774 __func__, dev_name(dev),
775 iommu_group_id(group->iommu_group));
776 /*
777 * XXX An unbound device in a live group is ok, but we'd
778 * really like to avoid the above BUG_ON by preventing other
779 * drivers from binding to it. Once that occurs, we have to
780 * stop the system to maintain isolation. At a minimum, we'd
781 * want a toggle to disable driver auto probe for this device.
782 */
783
784 mutex_lock(&group->unbound_lock);
785 list_for_each_entry(unbound,
786 &group->unbound_list, unbound_next) {
787 if (dev == unbound->dev) {
788 list_del(&unbound->unbound_next);
789 kfree(unbound);
790 break;
791 }
792 }
793 mutex_unlock(&group->unbound_lock);
794 break;
795 }
796
797 /*
798 * If we're the last reference to the group, the group will be
799 * released, which includes unregistering the iommu group notifier.
800 * We hold a read-lock on that notifier list, unregistering needs
801 * a write-lock... deadlock. Release our reference asynchronously
802 * to avoid that situation.
803 */
804 vfio_group_schedule_put(group);
805 return NOTIFY_OK;
806 }
807
808 /**
809 * VFIO driver API
810 */
811 int vfio_add_group_dev(struct device *dev,
812 const struct vfio_device_ops *ops, void *device_data)
813 {
814 struct iommu_group *iommu_group;
815 struct vfio_group *group;
816 struct vfio_device *device;
817
818 iommu_group = iommu_group_get(dev);
819 if (!iommu_group)
820 return -EINVAL;
821
822 group = vfio_group_get_from_iommu(iommu_group);
823 if (!group) {
824 group = vfio_create_group(iommu_group);
825 if (IS_ERR(group)) {
826 iommu_group_put(iommu_group);
827 return PTR_ERR(group);
828 }
829 } else {
830 /*
831 * A found vfio_group already holds a reference to the
832 * iommu_group. A created vfio_group keeps the reference.
833 */
834 iommu_group_put(iommu_group);
835 }
836
837 device = vfio_group_get_device(group, dev);
838 if (device) {
839 WARN(1, "Device %s already exists on group %d\n",
840 dev_name(dev), iommu_group_id(iommu_group));
841 vfio_device_put(device);
842 vfio_group_put(group);
843 return -EBUSY;
844 }
845
846 device = vfio_group_create_device(group, dev, ops, device_data);
847 if (IS_ERR(device)) {
848 vfio_group_put(group);
849 return PTR_ERR(device);
850 }
851
852 /*
853 * Drop all but the vfio_device reference. The vfio_device holds
854 * a reference to the vfio_group, which holds a reference to the
855 * iommu_group.
856 */
857 vfio_group_put(group);
858
859 return 0;
860 }
861 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
862
863 /**
864 * Get a reference to the vfio_device for a device. Even if the
865 * caller thinks they own the device, they could be racing with a
866 * release call path, so we can't trust drvdata for the shortcut.
867 * Go the long way around, from the iommu_group to the vfio_group
868 * to the vfio_device.
869 */
870 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
871 {
872 struct vfio_group *group;
873 struct vfio_device *device;
874
875 group = vfio_group_get_from_dev(dev);
876 if (!group)
877 return NULL;
878
879 device = vfio_group_get_device(group, dev);
880 vfio_group_put(group);
881
882 return device;
883 }
884 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
885
886 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
887 char *buf)
888 {
889 struct vfio_device *it, *device = NULL;
890
891 mutex_lock(&group->device_lock);
892 list_for_each_entry(it, &group->device_list, group_next) {
893 if (!strcmp(dev_name(it->dev), buf)) {
894 device = it;
895 vfio_device_get(device);
896 break;
897 }
898 }
899 mutex_unlock(&group->device_lock);
900
901 return device;
902 }
903
904 /*
905 * Caller must hold a reference to the vfio_device
906 */
907 void *vfio_device_data(struct vfio_device *device)
908 {
909 return device->device_data;
910 }
911 EXPORT_SYMBOL_GPL(vfio_device_data);
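
/*
 * A short sketch of how a caller holding only a struct device might look up
 * its vfio private data through the pair above (my_private is an assumed
 * type for illustration):
 *
 *	struct vfio_device *vdev = vfio_device_get_from_dev(dev);
 *	struct my_private *priv;
 *
 *	if (!vdev)
 *		return -ENODEV;
 *	priv = vfio_device_data(vdev);
 *	// ... use priv while the device reference is held ...
 *	vfio_device_put(vdev);
 */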
912
913 /*
914 * Decrement the device reference count and wait for the device to be
915 * removed. Open file descriptors for the device... */
916 void *vfio_del_group_dev(struct device *dev)
917 {
918 DEFINE_WAIT_FUNC(wait, woken_wake_function);
919 struct vfio_device *device = dev_get_drvdata(dev);
920 struct vfio_group *group = device->group;
921 void *device_data = device->device_data;
922 struct vfio_unbound_dev *unbound;
923 unsigned int i = 0;
924 bool interrupted = false;
925
926 /*
927 * The group exists so long as we have a device reference. Get
928 * a group reference and use it to scan for the device going away.
929 */
930 vfio_group_get(group);
931
932 /*
933 * When the device is removed from the group, the group suddenly
934 * becomes non-viable; the device has a driver (until the unbind
935 * completes), but it's not present in the group. This is bad news
936 * for any external users that need to re-acquire a group reference
937 * in order to match and release their existing reference. To
938 * solve this, we track such devices on the unbound_list to bridge
939 * the gap until they're fully unbound.
940 */
941 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
942 if (unbound) {
943 unbound->dev = dev;
944 mutex_lock(&group->unbound_lock);
945 list_add(&unbound->unbound_next, &group->unbound_list);
946 mutex_unlock(&group->unbound_lock);
947 }
948 WARN_ON(!unbound);
949
950 vfio_device_put(device);
951
952 /*
953 * If the device is still present in the group after the above
954 * 'put', then it is in use and we need to request it from the
955 * bus driver. The driver may in turn need to request the
956 * device from the user. We send the request on an arbitrary
957  * interval with a counter to allow the driver to take escalating
958 * measures to release the device if it has the ability to do so.
959 */
960 add_wait_queue(&vfio.release_q, &wait);
961
962 do {
963 device = vfio_group_get_device(group, dev);
964 if (!device)
965 break;
966
967 if (device->ops->request)
968 device->ops->request(device_data, i++);
969
970 vfio_device_put(device);
971
972 if (interrupted) {
973 wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
974 } else {
975 wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
976 if (signal_pending(current)) {
977 interrupted = true;
978 dev_warn(dev,
979 "Device is currently in use, task"
980 " \"%s\" (%d) "
981 "blocked until device is released",
982 current->comm, task_pid_nr(current));
983 }
984 }
985
986 } while (1);
987
988 remove_wait_queue(&vfio.release_q, &wait);
989 /*
990 * In order to support multiple devices per group, devices can be
991 * plucked from the group while other devices in the group are still
992 * in use. The container persists with this group and those remaining
993 * devices still attached. If the user creates an isolation violation
994 * by binding this device to another driver while the group is still in
995 * use, that's their fault. However, in the case of removing the last,
996 * or potentially the only, device in the group there can be no other
997 * in-use devices in the group. The user has done their due diligence
998 * and we should lay no claims to those devices. In order to do that,
999 * we need to make sure the group is detached from the container.
1000 * Without this stall, we're potentially racing with a user process
1001 * that may attempt to immediately bind this device to another driver.
1002 */
1003 if (list_empty(&group->device_list))
1004 wait_event(group->container_q, !group->container);
1005
1006 vfio_group_put(group);
1007
1008 return device_data;
1009 }
1010 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
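
/*
 * The request loop above relies on the bus driver's optional ->request()
 * callback to nudge the user into releasing the device.  A sketch of such a
 * callback, loosely modeled on relaying the request through an eventfd that
 * userspace registered earlier (my_vfio_device and its req_trigger/dev
 * fields are assumptions of the example):
 *
 *	static void my_vfio_request(void *device_data, unsigned int count)
 *	{
 *		struct my_vfio_device *vdev = device_data;
 *
 *		if (vdev->req_trigger) {
 *			dev_notice_ratelimited(vdev->dev,
 *				"Relaying device request to user (#%u)\n", count);
 *			eventfd_signal(vdev->req_trigger, 1);
 *		} else if (count == 0) {
 *			dev_warn(vdev->dev,
 *				"No device request channel registered, blocked until released by user\n");
 *		}
 *	}
 */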
1011
1012 /**
1013 * VFIO base fd, /dev/vfio/vfio
1014 */
1015 static long vfio_ioctl_check_extension(struct vfio_container *container,
1016 unsigned long arg)
1017 {
1018 struct vfio_iommu_driver *driver;
1019 long ret = 0;
1020
1021 down_read(&container->group_lock);
1022
1023 driver = container->iommu_driver;
1024
1025 switch (arg) {
1026 /* No base extensions yet */
1027 default:
1028 /*
1029 * If no driver is set, poll all registered drivers for
1030 * extensions and return the first positive result. If
1031 * a driver is already set, further queries will be passed
1032 * only to that driver.
1033 */
1034 if (!driver) {
1035 mutex_lock(&vfio.iommu_drivers_lock);
1036 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1037 vfio_next) {
1038
1039 #ifdef CONFIG_VFIO_NOIOMMU
1040 if (!list_empty(&container->group_list) &&
1041 (container->noiommu !=
1042 (driver->ops == &vfio_noiommu_ops)))
1043 continue;
1044 #endif
1045
1046 if (!try_module_get(driver->ops->owner))
1047 continue;
1048
1049 ret = driver->ops->ioctl(NULL,
1050 VFIO_CHECK_EXTENSION,
1051 arg);
1052 module_put(driver->ops->owner);
1053 if (ret > 0)
1054 break;
1055 }
1056 mutex_unlock(&vfio.iommu_drivers_lock);
1057 } else
1058 ret = driver->ops->ioctl(container->iommu_data,
1059 VFIO_CHECK_EXTENSION, arg);
1060 }
1061
1062 up_read(&container->group_lock);
1063
1064 return ret;
1065 }
1066
1067 /* hold write lock on container->group_lock */
1068 static int __vfio_container_attach_groups(struct vfio_container *container,
1069 struct vfio_iommu_driver *driver,
1070 void *data)
1071 {
1072 struct vfio_group *group;
1073 int ret = -ENODEV;
1074
1075 list_for_each_entry(group, &container->group_list, container_next) {
1076 ret = driver->ops->attach_group(data, group->iommu_group);
1077 if (ret)
1078 goto unwind;
1079 }
1080
1081 return ret;
1082
1083 unwind:
1084 list_for_each_entry_continue_reverse(group, &container->group_list,
1085 container_next) {
1086 driver->ops->detach_group(data, group->iommu_group);
1087 }
1088
1089 return ret;
1090 }
1091
1092 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1093 unsigned long arg)
1094 {
1095 struct vfio_iommu_driver *driver;
1096 long ret = -ENODEV;
1097
1098 down_write(&container->group_lock);
1099
1100 /*
1101 * The container is designed to be an unprivileged interface while
1102 * the group can be assigned to specific users. Therefore, only by
1103 * adding a group to a container does the user get the privilege of
1104 * enabling the iommu, which may allocate finite resources. There
1105 * is no unset_iommu, but by removing all the groups from a container,
1106 * the container is deprivileged and returns to an unset state.
1107 */
1108 if (list_empty(&container->group_list) || container->iommu_driver) {
1109 up_write(&container->group_lock);
1110 return -EINVAL;
1111 }
1112
1113 mutex_lock(&vfio.iommu_drivers_lock);
1114 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1115 void *data;
1116
1117 #ifdef CONFIG_VFIO_NOIOMMU
1118 /*
1119 * Only noiommu containers can use vfio-noiommu and noiommu
1120 * containers can only use vfio-noiommu.
1121 */
1122 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1123 continue;
1124 #endif
1125
1126 if (!try_module_get(driver->ops->owner))
1127 continue;
1128
1129 /*
1130 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1131 * so test which iommu driver reported support for this
1132 * extension and call open on them. We also pass them the
1133 * magic, allowing a single driver to support multiple
1134 * interfaces if they'd like.
1135 */
1136 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1137 module_put(driver->ops->owner);
1138 continue;
1139 }
1140
1141 data = driver->ops->open(arg);
1142 if (IS_ERR(data)) {
1143 ret = PTR_ERR(data);
1144 module_put(driver->ops->owner);
1145 continue;
1146 }
1147
1148 ret = __vfio_container_attach_groups(container, driver, data);
1149 if (ret) {
1150 driver->ops->release(data);
1151 module_put(driver->ops->owner);
1152 continue;
1153 }
1154
1155 container->iommu_driver = driver;
1156 container->iommu_data = data;
1157 break;
1158 }
1159
1160 mutex_unlock(&vfio.iommu_drivers_lock);
1161 up_write(&container->group_lock);
1162
1163 return ret;
1164 }
1165
1166 static long vfio_fops_unl_ioctl(struct file *filep,
1167 unsigned int cmd, unsigned long arg)
1168 {
1169 struct vfio_container *container = filep->private_data;
1170 struct vfio_iommu_driver *driver;
1171 void *data;
1172 long ret = -EINVAL;
1173
1174 if (!container)
1175 return ret;
1176
1177 switch (cmd) {
1178 case VFIO_GET_API_VERSION:
1179 ret = VFIO_API_VERSION;
1180 break;
1181 case VFIO_CHECK_EXTENSION:
1182 ret = vfio_ioctl_check_extension(container, arg);
1183 break;
1184 case VFIO_SET_IOMMU:
1185 ret = vfio_ioctl_set_iommu(container, arg);
1186 break;
1187 default:
1188 driver = container->iommu_driver;
1189 data = container->iommu_data;
1190
1191 if (driver) /* passthrough all unrecognized ioctls */
1192 ret = driver->ops->ioctl(data, cmd, arg);
1193 }
1194
1195 return ret;
1196 }
1197
1198 #ifdef CONFIG_COMPAT
1199 static long vfio_fops_compat_ioctl(struct file *filep,
1200 unsigned int cmd, unsigned long arg)
1201 {
1202 arg = (unsigned long)compat_ptr(arg);
1203 return vfio_fops_unl_ioctl(filep, cmd, arg);
1204 }
1205 #endif /* CONFIG_COMPAT */
1206
1207 static int vfio_fops_open(struct inode *inode, struct file *filep)
1208 {
1209 struct vfio_container *container;
1210
1211 container = kzalloc(sizeof(*container), GFP_KERNEL);
1212 if (!container)
1213 return -ENOMEM;
1214
1215 INIT_LIST_HEAD(&container->group_list);
1216 init_rwsem(&container->group_lock);
1217 kref_init(&container->kref);
1218
1219 filep->private_data = container;
1220
1221 return 0;
1222 }
1223
1224 static int vfio_fops_release(struct inode *inode, struct file *filep)
1225 {
1226 struct vfio_container *container = filep->private_data;
1227
1228 filep->private_data = NULL;
1229
1230 vfio_container_put(container);
1231
1232 return 0;
1233 }
1234
1235 /*
1236 * Once an iommu driver is set, we optionally pass read/write/mmap
1237 * on to the driver, allowing management interfaces beyond ioctl.
1238 */
1239 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1240 size_t count, loff_t *ppos)
1241 {
1242 struct vfio_container *container = filep->private_data;
1243 struct vfio_iommu_driver *driver;
1244 ssize_t ret = -EINVAL;
1245
1246 driver = container->iommu_driver;
1247 if (likely(driver && driver->ops->read))
1248 ret = driver->ops->read(container->iommu_data,
1249 buf, count, ppos);
1250
1251 return ret;
1252 }
1253
1254 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1255 size_t count, loff_t *ppos)
1256 {
1257 struct vfio_container *container = filep->private_data;
1258 struct vfio_iommu_driver *driver;
1259 ssize_t ret = -EINVAL;
1260
1261 driver = container->iommu_driver;
1262 if (likely(driver && driver->ops->write))
1263 ret = driver->ops->write(container->iommu_data,
1264 buf, count, ppos);
1265
1266 return ret;
1267 }
1268
1269 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1270 {
1271 struct vfio_container *container = filep->private_data;
1272 struct vfio_iommu_driver *driver;
1273 int ret = -EINVAL;
1274
1275 driver = container->iommu_driver;
1276 if (likely(driver && driver->ops->mmap))
1277 ret = driver->ops->mmap(container->iommu_data, vma);
1278
1279 return ret;
1280 }
1281
1282 static const struct file_operations vfio_fops = {
1283 .owner = THIS_MODULE,
1284 .open = vfio_fops_open,
1285 .release = vfio_fops_release,
1286 .read = vfio_fops_read,
1287 .write = vfio_fops_write,
1288 .unlocked_ioctl = vfio_fops_unl_ioctl,
1289 #ifdef CONFIG_COMPAT
1290 .compat_ioctl = vfio_fops_compat_ioctl,
1291 #endif
1292 .mmap = vfio_fops_mmap,
1293 };
1294
1295 /**
1296 * VFIO Group fd, /dev/vfio/$GROUP
1297 */
1298 static void __vfio_group_unset_container(struct vfio_group *group)
1299 {
1300 struct vfio_container *container = group->container;
1301 struct vfio_iommu_driver *driver;
1302
1303 down_write(&container->group_lock);
1304
1305 driver = container->iommu_driver;
1306 if (driver)
1307 driver->ops->detach_group(container->iommu_data,
1308 group->iommu_group);
1309
1310 group->container = NULL;
1311 wake_up(&group->container_q);
1312 list_del(&group->container_next);
1313
1314 /* Detaching the last group deprivileges a container, remove iommu */
1315 if (driver && list_empty(&container->group_list)) {
1316 driver->ops->release(container->iommu_data);
1317 module_put(driver->ops->owner);
1318 container->iommu_driver = NULL;
1319 container->iommu_data = NULL;
1320 }
1321
1322 up_write(&container->group_lock);
1323
1324 vfio_container_put(container);
1325 }
1326
1327 /*
1328 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1329 * if there was no container to unset. Since the ioctl is called on
1330  * the group, we know the group still exists; therefore the only valid
1331 * transition here is 1->0.
1332 */
1333 static int vfio_group_unset_container(struct vfio_group *group)
1334 {
1335 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1336
1337 if (!users)
1338 return -EINVAL;
1339 if (users != 1)
1340 return -EBUSY;
1341
1342 __vfio_group_unset_container(group);
1343
1344 return 0;
1345 }
1346
1347 /*
1348 * When removing container users, anything that removes the last user
1349 * implicitly removes the group from the container. That is, if the
1350 * group file descriptor is closed, as well as any device file descriptors,
1351 * the group is free.
1352 */
1353 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1354 {
1355 if (0 == atomic_dec_if_positive(&group->container_users))
1356 __vfio_group_unset_container(group);
1357 }
1358
1359 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1360 {
1361 struct fd f;
1362 struct vfio_container *container;
1363 struct vfio_iommu_driver *driver;
1364 int ret = 0;
1365
1366 if (atomic_read(&group->container_users))
1367 return -EINVAL;
1368
1369 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1370 return -EPERM;
1371
1372 f = fdget(container_fd);
1373 if (!f.file)
1374 return -EBADF;
1375
1376 /* Sanity check, is this really our fd? */
1377 if (f.file->f_op != &vfio_fops) {
1378 fdput(f);
1379 return -EINVAL;
1380 }
1381
1382 container = f.file->private_data;
1383 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1384
1385 down_write(&container->group_lock);
1386
1387 /* Real groups and fake groups cannot mix */
1388 if (!list_empty(&container->group_list) &&
1389 container->noiommu != group->noiommu) {
1390 ret = -EPERM;
1391 goto unlock_out;
1392 }
1393
1394 driver = container->iommu_driver;
1395 if (driver) {
1396 ret = driver->ops->attach_group(container->iommu_data,
1397 group->iommu_group);
1398 if (ret)
1399 goto unlock_out;
1400 }
1401
1402 group->container = container;
1403 container->noiommu = group->noiommu;
1404 list_add(&group->container_next, &container->group_list);
1405
1406 /* Get a reference on the container and mark a user within the group */
1407 vfio_container_get(container);
1408 atomic_inc(&group->container_users);
1409
1410 unlock_out:
1411 up_write(&container->group_lock);
1412 fdput(f);
1413 return ret;
1414 }
1415
1416 static bool vfio_group_viable(struct vfio_group *group)
1417 {
1418 return (iommu_group_for_each_dev(group->iommu_group,
1419 group, vfio_dev_viable) == 0);
1420 }
1421
1422 static int vfio_group_add_container_user(struct vfio_group *group)
1423 {
1424 if (!atomic_inc_not_zero(&group->container_users))
1425 return -EINVAL;
1426
1427 if (group->noiommu) {
1428 atomic_dec(&group->container_users);
1429 return -EPERM;
1430 }
1431 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1432 atomic_dec(&group->container_users);
1433 return -EINVAL;
1434 }
1435
1436 return 0;
1437 }
1438
1439 static const struct file_operations vfio_device_fops;
1440
1441 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1442 {
1443 struct vfio_device *device;
1444 struct file *filep;
1445 int ret;
1446
1447 if (0 == atomic_read(&group->container_users) ||
1448 !group->container->iommu_driver || !vfio_group_viable(group))
1449 return -EINVAL;
1450
1451 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1452 return -EPERM;
1453
1454 device = vfio_device_get_from_name(group, buf);
1455 if (!device)
1456 return -ENODEV;
1457
1458 ret = device->ops->open(device->device_data);
1459 if (ret) {
1460 vfio_device_put(device);
1461 return ret;
1462 }
1463
1464 /*
1465 * We can't use anon_inode_getfd() because we need to modify
1466 * the f_mode flags directly to allow more than just ioctls
1467 */
1468 ret = get_unused_fd_flags(O_CLOEXEC);
1469 if (ret < 0) {
1470 device->ops->release(device->device_data);
1471 vfio_device_put(device);
1472 return ret;
1473 }
1474
1475 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1476 device, O_RDWR);
1477 if (IS_ERR(filep)) {
1478 put_unused_fd(ret);
1479 ret = PTR_ERR(filep);
1480 device->ops->release(device->device_data);
1481 vfio_device_put(device);
1482 return ret;
1483 }
1484
1485 /*
1486 * TODO: add an anon_inode interface to do this.
1487 * Appears to be missing by lack of need rather than
1488 * explicitly prevented. Now there's need.
1489 */
1490 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1491
1492 atomic_inc(&group->container_users);
1493
1494 fd_install(ret, filep);
1495
1496 if (group->noiommu)
1497 dev_warn(device->dev, "vfio-noiommu device opened by user "
1498 "(%s:%d)\n", current->comm, task_pid_nr(current));
1499
1500 return ret;
1501 }
1502
1503 static long vfio_group_fops_unl_ioctl(struct file *filep,
1504 unsigned int cmd, unsigned long arg)
1505 {
1506 struct vfio_group *group = filep->private_data;
1507 long ret = -ENOTTY;
1508
1509 switch (cmd) {
1510 case VFIO_GROUP_GET_STATUS:
1511 {
1512 struct vfio_group_status status;
1513 unsigned long minsz;
1514
1515 minsz = offsetofend(struct vfio_group_status, flags);
1516
1517 if (copy_from_user(&status, (void __user *)arg, minsz))
1518 return -EFAULT;
1519
1520 if (status.argsz < minsz)
1521 return -EINVAL;
1522
1523 status.flags = 0;
1524
1525 if (vfio_group_viable(group))
1526 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1527
1528 if (group->container)
1529 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1530
1531 if (copy_to_user((void __user *)arg, &status, minsz))
1532 return -EFAULT;
1533
1534 ret = 0;
1535 break;
1536 }
1537 case VFIO_GROUP_SET_CONTAINER:
1538 {
1539 int fd;
1540
1541 if (get_user(fd, (int __user *)arg))
1542 return -EFAULT;
1543
1544 if (fd < 0)
1545 return -EINVAL;
1546
1547 ret = vfio_group_set_container(group, fd);
1548 break;
1549 }
1550 case VFIO_GROUP_UNSET_CONTAINER:
1551 ret = vfio_group_unset_container(group);
1552 break;
1553 case VFIO_GROUP_GET_DEVICE_FD:
1554 {
1555 char *buf;
1556
1557 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1558 if (IS_ERR(buf))
1559 return PTR_ERR(buf);
1560
1561 ret = vfio_group_get_device_fd(group, buf);
1562 kfree(buf);
1563 break;
1564 }
1565 }
1566
1567 return ret;
1568 }
1569
1570 #ifdef CONFIG_COMPAT
1571 static long vfio_group_fops_compat_ioctl(struct file *filep,
1572 unsigned int cmd, unsigned long arg)
1573 {
1574 arg = (unsigned long)compat_ptr(arg);
1575 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1576 }
1577 #endif /* CONFIG_COMPAT */
1578
1579 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1580 {
1581 struct vfio_group *group;
1582 int opened;
1583
1584 group = vfio_group_get_from_minor(iminor(inode));
1585 if (!group)
1586 return -ENODEV;
1587
1588 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1589 vfio_group_put(group);
1590 return -EPERM;
1591 }
1592
1593 /* Do we need multiple instances of the group open? Seems not. */
1594 opened = atomic_cmpxchg(&group->opened, 0, 1);
1595 if (opened) {
1596 vfio_group_put(group);
1597 return -EBUSY;
1598 }
1599
1600 /* Is something still in use from a previous open? */
1601 if (group->container) {
1602 atomic_dec(&group->opened);
1603 vfio_group_put(group);
1604 return -EBUSY;
1605 }
1606
1607 /* Warn if previous user didn't cleanup and re-init to drop them */
1608 if (WARN_ON(group->notifier.head))
1609 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1610
1611 filep->private_data = group;
1612
1613 return 0;
1614 }
1615
1616 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1617 {
1618 struct vfio_group *group = filep->private_data;
1619
1620 filep->private_data = NULL;
1621
1622 vfio_group_try_dissolve_container(group);
1623
1624 atomic_dec(&group->opened);
1625
1626 vfio_group_put(group);
1627
1628 return 0;
1629 }
1630
1631 static const struct file_operations vfio_group_fops = {
1632 .owner = THIS_MODULE,
1633 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1634 #ifdef CONFIG_COMPAT
1635 .compat_ioctl = vfio_group_fops_compat_ioctl,
1636 #endif
1637 .open = vfio_group_fops_open,
1638 .release = vfio_group_fops_release,
1639 };
1640
1641 /**
1642 * VFIO Device fd
1643 */
1644 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1645 {
1646 struct vfio_device *device = filep->private_data;
1647
1648 device->ops->release(device->device_data);
1649
1650 vfio_group_try_dissolve_container(device->group);
1651
1652 vfio_device_put(device);
1653
1654 return 0;
1655 }
1656
1657 static long vfio_device_fops_unl_ioctl(struct file *filep,
1658 unsigned int cmd, unsigned long arg)
1659 {
1660 struct vfio_device *device = filep->private_data;
1661
1662 if (unlikely(!device->ops->ioctl))
1663 return -EINVAL;
1664
1665 return device->ops->ioctl(device->device_data, cmd, arg);
1666 }
1667
1668 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1669 size_t count, loff_t *ppos)
1670 {
1671 struct vfio_device *device = filep->private_data;
1672
1673 if (unlikely(!device->ops->read))
1674 return -EINVAL;
1675
1676 return device->ops->read(device->device_data, buf, count, ppos);
1677 }
1678
1679 static ssize_t vfio_device_fops_write(struct file *filep,
1680 const char __user *buf,
1681 size_t count, loff_t *ppos)
1682 {
1683 struct vfio_device *device = filep->private_data;
1684
1685 if (unlikely(!device->ops->write))
1686 return -EINVAL;
1687
1688 return device->ops->write(device->device_data, buf, count, ppos);
1689 }
1690
1691 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1692 {
1693 struct vfio_device *device = filep->private_data;
1694
1695 if (unlikely(!device->ops->mmap))
1696 return -EINVAL;
1697
1698 return device->ops->mmap(device->device_data, vma);
1699 }
1700
1701 #ifdef CONFIG_COMPAT
1702 static long vfio_device_fops_compat_ioctl(struct file *filep,
1703 unsigned int cmd, unsigned long arg)
1704 {
1705 arg = (unsigned long)compat_ptr(arg);
1706 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1707 }
1708 #endif /* CONFIG_COMPAT */
1709
1710 static const struct file_operations vfio_device_fops = {
1711 .owner = THIS_MODULE,
1712 .release = vfio_device_fops_release,
1713 .read = vfio_device_fops_read,
1714 .write = vfio_device_fops_write,
1715 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1716 #ifdef CONFIG_COMPAT
1717 .compat_ioctl = vfio_device_fops_compat_ioctl,
1718 #endif
1719 .mmap = vfio_device_fops_mmap,
1720 };
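
/*
 * A condensed userspace sketch of the container/group/device handshake that
 * the three sets of fops above implement (the group number, device name and
 * type1 IOMMU choice are illustrative; error handling is elided):
 *
 *	int container, group, device;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;		// unknown API version
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;		// no type1 support
 *
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;		// not all devices bound to vfio/stub
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */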
1721
1722 /**
1723 * External user API, exported by symbols to be linked dynamically.
1724 *
1725 * The protocol includes:
1726 * 1. do normal VFIO init operation:
1727 * - opening a new container;
1728 * - attaching group(s) to it;
1729 * - setting an IOMMU driver for a container.
1730 * When IOMMU is set for a container, all groups in it are
1731 * considered ready to use by an external user.
1732 *
1733 * 2. User space passes a group fd to an external user.
1734 * The external user calls vfio_group_get_external_user()
1735 * to verify that:
1736 * - the group is initialized;
1737 * - IOMMU is set for it.
1738 * If both checks passed, vfio_group_get_external_user()
1739 * increments the container user counter to prevent
1740 * the VFIO group from disposal before KVM exits.
1741 *
1742 * 3. The external user calls vfio_external_user_iommu_id()
1743 * to know an IOMMU ID.
1744 *
1745 * 4. When the external KVM finishes, it calls
1746 * vfio_group_put_external_user() to release the VFIO group.
1747 * This call decrements the container user counter.
1748 */
1749 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1750 {
1751 struct vfio_group *group = filep->private_data;
1752 int ret;
1753
1754 if (filep->f_op != &vfio_group_fops)
1755 return ERR_PTR(-EINVAL);
1756
1757 ret = vfio_group_add_container_user(group);
1758 if (ret)
1759 return ERR_PTR(ret);
1760
1761 vfio_group_get(group);
1762
1763 return group;
1764 }
1765 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1766
1767 void vfio_group_put_external_user(struct vfio_group *group)
1768 {
1769 vfio_group_try_dissolve_container(group);
1770 vfio_group_put(group);
1771 }
1772 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1773
1774 bool vfio_external_group_match_file(struct vfio_group *test_group,
1775 struct file *filep)
1776 {
1777 struct vfio_group *group = filep->private_data;
1778
1779 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1780 }
1781 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1782
1783 int vfio_external_user_iommu_id(struct vfio_group *group)
1784 {
1785 return iommu_group_id(group->iommu_group);
1786 }
1787 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1788
1789 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1790 {
1791 return vfio_ioctl_check_extension(group->container, arg);
1792 }
1793 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
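
/*
 * A minimal sketch of an in-kernel consumer (KVM-like) resolving a group file
 * descriptor handed over by userspace; my_consume_group_fd is illustrative:
 *
 *	static int my_consume_group_fd(int group_fd)
 *	{
 *		struct fd f = fdget(group_fd);
 *		struct vfio_group *group;
 *		int iommu_id;
 *
 *		if (!f.file)
 *			return -EBADF;
 *
 *		group = vfio_group_get_external_user(f.file);
 *		fdput(f);
 *		if (IS_ERR(group))
 *			return PTR_ERR(group);	// no container or IOMMU set yet
 *
 *		iommu_id = vfio_external_user_iommu_id(group);
 *		// ... hold the group reference while the device is in use ...
 *		vfio_group_put_external_user(group);
 *		return iommu_id;
 *	}
 */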
1794
1795 /**
1796 * Sub-module support
1797 */
1798 /*
1799 * Helper for managing a buffer of info chain capabilities, allocate or
1800 * reallocate a buffer with additional @size, filling in @id and @version
1801 * of the capability. A pointer to the new capability is returned.
1802 *
1803 * NB. The chain is based at the head of the buffer, so new entries are
1804 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1805 * next offsets prior to copying to the user buffer.
1806 */
1807 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1808 size_t size, u16 id, u16 version)
1809 {
1810 void *buf;
1811 struct vfio_info_cap_header *header, *tmp;
1812
1813 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1814 if (!buf) {
1815 kfree(caps->buf);
1816 caps->size = 0;
1817 return ERR_PTR(-ENOMEM);
1818 }
1819
1820 caps->buf = buf;
1821 header = buf + caps->size;
1822
1823 /* Eventually copied to user buffer, zero */
1824 memset(header, 0, size);
1825
1826 header->id = id;
1827 header->version = version;
1828
1829 /* Add to the end of the capability chain */
1830 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1831 ; /* nothing */
1832
1833 tmp->next = caps->size;
1834 caps->size += size;
1835
1836 return header;
1837 }
1838 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1839
1840 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1841 {
1842 struct vfio_info_cap_header *tmp;
1843 void *buf = (void *)caps->buf;
1844
1845 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1846 tmp->next += offset;
1847 }
1848 EXPORT_SYMBOL(vfio_info_cap_shift);
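
/*
 * A sketch of the intended call pattern for the two helpers above, as a bus
 * driver's region-info ioctl might use them (sparse, info and arg are assumed
 * to come from the surrounding handler):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	int ret;
 *
 *	ret = vfio_info_add_capability(&caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
 *				       sparse);
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */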
1849
1850 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1851 {
1852 struct vfio_info_cap_header *header;
1853 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1854 size_t size;
1855
1856 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1857 header = vfio_info_cap_add(caps, size,
1858 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1859 if (IS_ERR(header))
1860 return PTR_ERR(header);
1861
1862 sparse_cap = container_of(header,
1863 struct vfio_region_info_cap_sparse_mmap, header);
1864 sparse_cap->nr_areas = sparse->nr_areas;
1865 memcpy(sparse_cap->areas, sparse->areas,
1866 sparse->nr_areas * sizeof(*sparse->areas));
1867 return 0;
1868 }
1869
1870 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1871 {
1872 struct vfio_info_cap_header *header;
1873 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1874
1875 header = vfio_info_cap_add(caps, sizeof(*cap),
1876 VFIO_REGION_INFO_CAP_TYPE, 1);
1877 if (IS_ERR(header))
1878 return PTR_ERR(header);
1879
1880 type_cap = container_of(header, struct vfio_region_info_cap_type,
1881 header);
1882 type_cap->type = cap->type;
1883 type_cap->subtype = cap->subtype;
1884 return 0;
1885 }
1886
1887 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1888 void *cap_type)
1889 {
1890 int ret = -EINVAL;
1891
1892 if (!cap_type)
1893 return 0;
1894
1895 switch (cap_type_id) {
1896 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1897 ret = sparse_mmap_cap(caps, cap_type);
1898 break;
1899
1900 case VFIO_REGION_INFO_CAP_TYPE:
1901 ret = region_type_cap(caps, cap_type);
1902 break;
1903 }
1904
1905 return ret;
1906 }
1907 EXPORT_SYMBOL(vfio_info_add_capability);
1908
1909 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1910 int max_irq_type, size_t *data_size)
1911 {
1912 unsigned long minsz;
1913 size_t size;
1914
1915 minsz = offsetofend(struct vfio_irq_set, count);
1916
1917 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1918 (hdr->count >= (U32_MAX - hdr->start)) ||
1919 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1920 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1921 return -EINVAL;
1922
1923 if (data_size)
1924 *data_size = 0;
1925
1926 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1927 return -EINVAL;
1928
1929 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1930 case VFIO_IRQ_SET_DATA_NONE:
1931 size = 0;
1932 break;
1933 case VFIO_IRQ_SET_DATA_BOOL:
1934 size = sizeof(uint8_t);
1935 break;
1936 case VFIO_IRQ_SET_DATA_EVENTFD:
1937 size = sizeof(int32_t);
1938 break;
1939 default:
1940 return -EINVAL;
1941 }
1942
1943 if (size) {
1944 if (hdr->argsz - minsz < hdr->count * size)
1945 return -EINVAL;
1946
1947 if (!data_size)
1948 return -EINVAL;
1949
1950 *data_size = hdr->count * size;
1951 }
1952
1953 return 0;
1954 }
1955 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
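
/*
 * A sketch of how a bus driver's VFIO_DEVICE_SET_IRQS handler might use the
 * helper above (my_num_irqs, MY_NUM_IRQ_TYPES and the later programming step
 * are assumptions of the example):
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *
 *	// ... program hdr.index/hdr.start/hdr.count using data ...
 *	kfree(data);
 */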
1956
1957 /*
1958 * Pin a set of guest PFNs and return their associated host PFNs for local
1959 * domain only.
1960 * @dev [in] : device
1961 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1962 * @npage [in] : count of elements in user_pfn array. This count should not
1963 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1964 * @prot [in] : protection flags
1965 * @phys_pfn[out]: array of host PFNs
1966 * Return error or number of pages pinned.
1967 */
1968 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1969 int prot, unsigned long *phys_pfn)
1970 {
1971 struct vfio_container *container;
1972 struct vfio_group *group;
1973 struct vfio_iommu_driver *driver;
1974 int ret;
1975
1976 if (!dev || !user_pfn || !phys_pfn || !npage)
1977 return -EINVAL;
1978
1979 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1980 return -E2BIG;
1981
1982 group = vfio_group_get_from_dev(dev);
1983 if (!group)
1984 return -ENODEV;
1985
1986 ret = vfio_group_add_container_user(group);
1987 if (ret)
1988 goto err_pin_pages;
1989
1990 container = group->container;
1991 driver = container->iommu_driver;
1992 if (likely(driver && driver->ops->pin_pages))
1993 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1994 npage, prot, phys_pfn);
1995 else
1996 ret = -ENOTTY;
1997
1998 vfio_group_try_dissolve_container(group);
1999
2000 err_pin_pages:
2001 vfio_group_put(group);
2002 return ret;
2003 }
2004 EXPORT_SYMBOL(vfio_pin_pages);
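
/*
 * Usage sketch, for illustration only: a mediated-device vendor driver that
 * needs to DMA into guest memory pins the guest PFNs backing a buffer and
 * programs its hardware with the returned host PFNs.  Each call must cover
 * at most VFIO_PIN_PAGES_MAX_ENTRIES pages.  The example_ helper is
 * hypothetical.
 */
static int example_pin_guest_buffer(struct device *mdev_dev,
				    unsigned long *gfns,
				    unsigned long *hpfns, int npages)
{
	int pinned = vfio_pin_pages(mdev_dev, gfns, npages,
				    IOMMU_READ | IOMMU_WRITE, hpfns);

	if (pinned < 0)
		return pinned;			/* negative errno */

	if (pinned != npages) {
		/* Partial pin: undo what was pinned and report failure. */
		vfio_unpin_pages(mdev_dev, gfns, pinned);
		return -EFAULT;
	}

	return 0;
}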
2005
2006 /*
2007 * Unpin a set of host PFNs for local domain only.
2008 * @dev [in] : device
2009 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2010 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2011 * @npage [in] : count of elements in user_pfn array. This count should not
2012 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2013 * Return error or number of pages unpinned.
2014 */
2015 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2016 {
2017 struct vfio_container *container;
2018 struct vfio_group *group;
2019 struct vfio_iommu_driver *driver;
2020 int ret;
2021
2022 if (!dev || !user_pfn || !npage)
2023 return -EINVAL;
2024
2025 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2026 return -E2BIG;
2027
2028 group = vfio_group_get_from_dev(dev);
2029 if (!group)
2030 return -ENODEV;
2031
2032 ret = vfio_group_add_container_user(group);
2033 if (ret)
2034 goto err_unpin_pages;
2035
2036 container = group->container;
2037 driver = container->iommu_driver;
2038 if (likely(driver && driver->ops->unpin_pages))
2039 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2040 npage);
2041 else
2042 ret = -ENOTTY;
2043
2044 vfio_group_try_dissolve_container(group);
2045
2046 err_unpin_pages:
2047 vfio_group_put(group);
2048 return ret;
2049 }
2050 EXPORT_SYMBOL(vfio_unpin_pages);
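
/*
 * Usage sketch, for illustration only: pages pinned as above are released
 * once the device is done with them, e.g. on a DMA unmap notification or at
 * device teardown.  The example_ helper is hypothetical.
 */
static void example_unpin_guest_buffer(struct device *mdev_dev,
				       unsigned long *gfns, int npages)
{
	int unpinned = vfio_unpin_pages(mdev_dev, gfns, npages);

	WARN_ON(unpinned != npages);
}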
2051
2052 static int vfio_register_iommu_notifier(struct vfio_group *group,
2053 unsigned long *events,
2054 struct notifier_block *nb)
2055 {
2056 struct vfio_container *container;
2057 struct vfio_iommu_driver *driver;
2058 int ret;
2059
2060 ret = vfio_group_add_container_user(group);
2061 if (ret)
2062 return -EINVAL;
2063
2064 container = group->container;
2065 driver = container->iommu_driver;
2066 if (likely(driver && driver->ops->register_notifier))
2067 ret = driver->ops->register_notifier(container->iommu_data,
2068 events, nb);
2069 else
2070 ret = -ENOTTY;
2071
2072 vfio_group_try_dissolve_container(group);
2073
2074 return ret;
2075 }
2076
2077 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2078 struct notifier_block *nb)
2079 {
2080 struct vfio_container *container;
2081 struct vfio_iommu_driver *driver;
2082 int ret;
2083
2084 ret = vfio_group_add_container_user(group);
2085 if (ret)
2086 return -EINVAL;
2087
2088 container = group->container;
2089 driver = container->iommu_driver;
2090 if (likely(driver && driver->ops->unregister_notifier))
2091 ret = driver->ops->unregister_notifier(container->iommu_data,
2092 nb);
2093 else
2094 ret = -ENOTTY;
2095
2096 vfio_group_try_dissolve_container(group);
2097
2098 return ret;
2099 }
2100
2101 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2102 {
2103 group->kvm = kvm;
2104 blocking_notifier_call_chain(&group->notifier,
2105 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2106 }
2107 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2108
2109 static int vfio_register_group_notifier(struct vfio_group *group,
2110 unsigned long *events,
2111 struct notifier_block *nb)
2112 {
2113 int ret;
2114 bool set_kvm = false;
2115
2116 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2117 set_kvm = true;
2118
2119 /* clear known events */
2120 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2121
2122 /* refuse to continue if unknown events remain */
2123 if (*events)
2124 return -EINVAL;
2125
2126 ret = vfio_group_add_container_user(group);
2127 if (ret)
2128 return -EINVAL;
2129
2130 ret = blocking_notifier_chain_register(&group->notifier, nb);
2131
2132 /*
2133 * KVM may already have been attached to this vfio_group, so replay
2134 * the event once here upon registration so the new listener sees it.
2135 */
2136 if (!ret && set_kvm && group->kvm)
2137 blocking_notifier_call_chain(&group->notifier,
2138 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2139
2140 vfio_group_try_dissolve_container(group);
2141
2142 return ret;
2143 }
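
/*
 * Usage sketch, for illustration only: a callback registered for
 * VFIO_GROUP_NOTIFY_SET_KVM receives the struct kvm pointer as its data
 * argument (NULL on detach).  Thanks to the replay above, a listener that
 * registers after KVM was attached still sees the event once.  struct
 * example_vdev is a hypothetical vendor-driver structure embedding the
 * notifier_block.
 */
static int example_group_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct example_vdev *vdev = container_of(nb, struct example_vdev,
						 group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM)
		vdev->kvm = data;	/* NULL means KVM was detached */

	return NOTIFY_OK;
}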
2144
2145 static int vfio_unregister_group_notifier(struct vfio_group *group,
2146 struct notifier_block *nb)
2147 {
2148 int ret;
2149
2150 ret = vfio_group_add_container_user(group);
2151 if (ret)
2152 return -EINVAL;
2153
2154 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2155
2156 vfio_group_try_dissolve_container(group);
2157
2158 return ret;
2159 }
2160
2161 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2162 unsigned long *events, struct notifier_block *nb)
2163 {
2164 struct vfio_group *group;
2165 int ret;
2166
2167 if (!dev || !nb || !events || (*events == 0))
2168 return -EINVAL;
2169
2170 group = vfio_group_get_from_dev(dev);
2171 if (!group)
2172 return -ENODEV;
2173
2174 switch (type) {
2175 case VFIO_IOMMU_NOTIFY:
2176 ret = vfio_register_iommu_notifier(group, events, nb);
2177 break;
2178 case VFIO_GROUP_NOTIFY:
2179 ret = vfio_register_group_notifier(group, events, nb);
2180 break;
2181 default:
2182 ret = -EINVAL;
2183 }
2184
2185 vfio_group_put(group);
2186 return ret;
2187 }
2188 EXPORT_SYMBOL(vfio_register_notifier);
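
/*
 * Usage sketch, for illustration only: a vendor driver that pins guest
 * pages registers an IOMMU notifier so it can drop its pinnings when the
 * covering IOVA range is unmapped.  events is passed by reference; as with
 * the group notifier above, supported bits are consumed and any leftover
 * bit results in an error.  example_iommu_notifier() and struct
 * example_vdev are hypothetical.
 */
static int example_register_dma_unmap_notifier(struct example_vdev *vdev,
					       struct device *mdev_dev)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	vdev->iommu_notifier.notifier_call = example_iommu_notifier;

	return vfio_register_notifier(mdev_dev, VFIO_IOMMU_NOTIFY, &events,
				      &vdev->iommu_notifier);
}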
2189
2190 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2191 struct notifier_block *nb)
2192 {
2193 struct vfio_group *group;
2194 int ret;
2195
2196 if (!dev || !nb)
2197 return -EINVAL;
2198
2199 group = vfio_group_get_from_dev(dev);
2200 if (!group)
2201 return -ENODEV;
2202
2203 switch (type) {
2204 case VFIO_IOMMU_NOTIFY:
2205 ret = vfio_unregister_iommu_notifier(group, nb);
2206 break;
2207 case VFIO_GROUP_NOTIFY:
2208 ret = vfio_unregister_group_notifier(group, nb);
2209 break;
2210 default:
2211 ret = -EINVAL;
2212 }
2213
2214 vfio_group_put(group);
2215 return ret;
2216 }
2217 EXPORT_SYMBOL(vfio_unregister_notifier);
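
/*
 * Usage sketch, for illustration only: notifiers registered earlier are
 * torn down symmetrically when the vendor driver releases the device.
 * struct example_vdev and its notifier_block fields are hypothetical.
 */
static void example_release_notifiers(struct example_vdev *vdev,
				      struct device *mdev_dev)
{
	vfio_unregister_notifier(mdev_dev, VFIO_IOMMU_NOTIFY,
				 &vdev->iommu_notifier);
	vfio_unregister_notifier(mdev_dev, VFIO_GROUP_NOTIFY,
				 &vdev->group_notifier);
}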
2218
2219 /**
2220 * Module/class support
2221 */
2222 static char *vfio_devnode(struct device *dev, umode_t *mode)
2223 {
2224 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2225 }
2226
2227 static struct miscdevice vfio_dev = {
2228 .minor = VFIO_MINOR,
2229 .name = "vfio",
2230 .fops = &vfio_fops,
2231 .nodename = "vfio/vfio",
2232 .mode = S_IRUGO | S_IWUGO,
2233 };
2234
2235 static int __init vfio_init(void)
2236 {
2237 int ret;
2238
2239 idr_init(&vfio.group_idr);
2240 mutex_init(&vfio.group_lock);
2241 mutex_init(&vfio.iommu_drivers_lock);
2242 INIT_LIST_HEAD(&vfio.group_list);
2243 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2244 init_waitqueue_head(&vfio.release_q);
2245
2246 ret = misc_register(&vfio_dev);
2247 if (ret) {
2248 pr_err("vfio: misc device register failed\n");
2249 return ret;
2250 }
2251
2252 /* /dev/vfio/$GROUP */
2253 vfio.class = class_create(THIS_MODULE, "vfio");
2254 if (IS_ERR(vfio.class)) {
2255 ret = PTR_ERR(vfio.class);
2256 goto err_class;
2257 }
2258
2259 vfio.class->devnode = vfio_devnode;
2260
2261 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2262 if (ret)
2263 goto err_alloc_chrdev;
2264
2265 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2266 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2267 if (ret)
2268 goto err_cdev_add;
2269
2270 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2271
2272 #ifdef CONFIG_VFIO_NOIOMMU
2273 vfio_register_iommu_driver(&vfio_noiommu_ops);
2274 #endif
2275 return 0;
2276
2277 err_cdev_add:
2278 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2279 err_alloc_chrdev:
2280 class_destroy(vfio.class);
2281 vfio.class = NULL;
2282 err_class:
2283 misc_deregister(&vfio_dev);
2284 return ret;
2285 }
2286
2287 static void __exit vfio_cleanup(void)
2288 {
2289 WARN_ON(!list_empty(&vfio.group_list));
2290
2291 #ifdef CONFIG_VFIO_NOIOMMU
2292 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2293 #endif
2294 idr_destroy(&vfio.group_idr);
2295 cdev_del(&vfio.group_cdev);
2296 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2297 class_destroy(vfio.class);
2298 vfio.class = NULL;
2299 misc_deregister(&vfio_dev);
2300 }
2301
2302 module_init(vfio_init);
2303 module_exit(vfio_cleanup);
2304
2305 MODULE_VERSION(DRIVER_VERSION);
2306 MODULE_LICENSE("GPL v2");
2307 MODULE_AUTHOR(DRIVER_AUTHOR);
2308 MODULE_DESCRIPTION(DRIVER_DESC);
2309 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2310 MODULE_ALIAS("devname:vfio/vfio");
2311 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");