1 /*
2 * VFIO core
3 *
4 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
5 * Author: Alex Williamson <alex.williamson@redhat.com>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 *
11 * Derived from original vfio:
12 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
13 * Author: Tom Lyon, pugs@cisco.com
14 */
15
16 #include <linux/cdev.h>
17 #include <linux/compat.h>
18 #include <linux/device.h>
19 #include <linux/file.h>
20 #include <linux/anon_inodes.h>
21 #include <linux/fs.h>
22 #include <linux/idr.h>
23 #include <linux/iommu.h>
24 #include <linux/list.h>
25 #include <linux/miscdevice.h>
26 #include <linux/module.h>
27 #include <linux/mutex.h>
28 #include <linux/pci.h>
29 #include <linux/rwsem.h>
30 #include <linux/sched.h>
31 #include <linux/slab.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/wait.h>
37 #include <linux/sched/signal.h>
38
39 #define DRIVER_VERSION "0.3"
40 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
41 #define DRIVER_DESC "VFIO - User Level meta-driver"
42
43 static struct vfio {
44 struct class *class;
45 struct list_head iommu_drivers_list;
46 struct mutex iommu_drivers_lock;
47 struct list_head group_list;
48 struct idr group_idr;
49 struct mutex group_lock;
50 struct cdev group_cdev;
51 dev_t group_devt;
52 wait_queue_head_t release_q;
53 } vfio;
54
55 struct vfio_iommu_driver {
56 const struct vfio_iommu_driver_ops *ops;
57 struct list_head vfio_next;
58 };
59
60 struct vfio_container {
61 struct kref kref;
62 struct list_head group_list;
63 struct rw_semaphore group_lock;
64 struct vfio_iommu_driver *iommu_driver;
65 void *iommu_data;
66 bool noiommu;
67 };
68
69 struct vfio_unbound_dev {
70 struct device *dev;
71 struct list_head unbound_next;
72 };
73
74 struct vfio_group {
75 struct kref kref;
76 int minor;
77 atomic_t container_users;
78 struct iommu_group *iommu_group;
79 struct vfio_container *container;
80 struct list_head device_list;
81 struct mutex device_lock;
82 struct device *dev;
83 struct notifier_block nb;
84 struct list_head vfio_next;
85 struct list_head container_next;
86 struct list_head unbound_list;
87 struct mutex unbound_lock;
88 atomic_t opened;
89 wait_queue_head_t container_q;
90 bool noiommu;
91 struct kvm *kvm;
92 struct blocking_notifier_head notifier;
93 };
94
95 struct vfio_device {
96 struct kref kref;
97 struct device *dev;
98 const struct vfio_device_ops *ops;
99 struct vfio_group *group;
100 struct list_head group_next;
101 void *device_data;
102 };
103
104 #ifdef CONFIG_VFIO_NOIOMMU
105 static bool noiommu __read_mostly;
106 module_param_named(enable_unsafe_noiommu_mode,
107 noiommu, bool, S_IRUGO | S_IWUSR);
108 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
109 #endif
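
/*
 * Usage note: the parameter above is writable (S_IRUGO | S_IWUSR), so the
 * mode can be enabled at load time (e.g. "modprobe vfio
 * enable_unsafe_noiommu_mode=1" when vfio is modular) or afterwards via
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode.  Groups created in
 * this mode appear as /dev/vfio/noiommu-$GROUP and require CAP_SYS_RAWIO.
 */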
110
111 /*
112 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
113  * and remove functions; any use cases other than acquiring the first
114 * reference for the purpose of calling vfio_add_group_dev() or removing
115 * that symmetric reference after vfio_del_group_dev() should use the raw
116 * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put()
117 * removes the device from the dummy group and cannot be nested.
118 */
119 struct iommu_group *vfio_iommu_group_get(struct device *dev)
120 {
121 struct iommu_group *group;
122 int __maybe_unused ret;
123
124 group = iommu_group_get(dev);
125
126 #ifdef CONFIG_VFIO_NOIOMMU
127 /*
128 * With noiommu enabled, an IOMMU group will be created for a device
129  * that doesn't already have one and doesn't have an iommu_ops on its
130 * bus. We set iommudata simply to be able to identify these groups
131 * as special use and for reclamation later.
132 */
133 if (group || !noiommu || iommu_present(dev->bus))
134 return group;
135
136 group = iommu_group_alloc();
137 if (IS_ERR(group))
138 return NULL;
139
140 iommu_group_set_name(group, "vfio-noiommu");
141 iommu_group_set_iommudata(group, &noiommu, NULL);
142 ret = iommu_group_add_device(group, dev);
143 if (ret) {
144 iommu_group_put(group);
145 return NULL;
146 }
147
148 /*
149 * Where to taint? At this point we've added an IOMMU group for a
150 * device that is not backed by iommu_ops, therefore any iommu_
151 * callback using iommu_ops can legitimately Oops. So, while we may
152 * be about to give a DMA capable device to a user without IOMMU
153 * protection, which is clearly taint-worthy, let's go ahead and do
154 * it here.
155 */
156 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
157 dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
158 #endif
159
160 return group;
161 }
162 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
163
164 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
165 {
166 #ifdef CONFIG_VFIO_NOIOMMU
167 if (iommu_group_get_iommudata(group) == &noiommu)
168 iommu_group_remove_device(dev);
169 #endif
170
171 iommu_group_put(group);
172 }
173 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
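
/*
 * A minimal sketch of the symmetric pairing described above, as a VFIO bus
 * driver's probe/remove path might use it.  The my_* names, ops table and
 * private data type are illustrative assumptions, not part of this file:
 *
 *	static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 *	{
 *		struct iommu_group *group;
 *		struct my_device *vdev;
 *		int ret;
 *
 *		group = vfio_iommu_group_get(&pdev->dev);
 *		if (!group)
 *			return -EINVAL;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev) {
 *			vfio_iommu_group_put(group, &pdev->dev);
 *			return -ENOMEM;
 *		}
 *
 *		ret = vfio_add_group_dev(&pdev->dev, &my_vfio_ops, vdev);
 *		if (ret) {
 *			kfree(vdev);
 *			vfio_iommu_group_put(group, &pdev->dev);
 *		}
 *		return ret;
 *	}
 *
 *	static void my_remove(struct pci_dev *pdev)
 *	{
 *		struct my_device *vdev = vfio_del_group_dev(&pdev->dev);
 *
 *		vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
 *		kfree(vdev);
 *	}
 */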
174
175 #ifdef CONFIG_VFIO_NOIOMMU
176 static void *vfio_noiommu_open(unsigned long arg)
177 {
178 if (arg != VFIO_NOIOMMU_IOMMU)
179 return ERR_PTR(-EINVAL);
180 if (!capable(CAP_SYS_RAWIO))
181 return ERR_PTR(-EPERM);
182
183 return NULL;
184 }
185
186 static void vfio_noiommu_release(void *iommu_data)
187 {
188 }
189
190 static long vfio_noiommu_ioctl(void *iommu_data,
191 unsigned int cmd, unsigned long arg)
192 {
193 if (cmd == VFIO_CHECK_EXTENSION)
194 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
195
196 return -ENOTTY;
197 }
198
199 static int vfio_noiommu_attach_group(void *iommu_data,
200 struct iommu_group *iommu_group)
201 {
202 return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
203 }
204
205 static void vfio_noiommu_detach_group(void *iommu_data,
206 struct iommu_group *iommu_group)
207 {
208 }
209
210 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
211 .name = "vfio-noiommu",
212 .owner = THIS_MODULE,
213 .open = vfio_noiommu_open,
214 .release = vfio_noiommu_release,
215 .ioctl = vfio_noiommu_ioctl,
216 .attach_group = vfio_noiommu_attach_group,
217 .detach_group = vfio_noiommu_detach_group,
218 };
219 #endif
220
221
222 /**
223 * IOMMU driver registration
224 */
225 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
226 {
227 struct vfio_iommu_driver *driver, *tmp;
228
229 driver = kzalloc(sizeof(*driver), GFP_KERNEL);
230 if (!driver)
231 return -ENOMEM;
232
233 driver->ops = ops;
234
235 mutex_lock(&vfio.iommu_drivers_lock);
236
237 /* Check for duplicates */
238 list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
239 if (tmp->ops == ops) {
240 mutex_unlock(&vfio.iommu_drivers_lock);
241 kfree(driver);
242 return -EINVAL;
243 }
244 }
245
246 list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
247
248 mutex_unlock(&vfio.iommu_drivers_lock);
249
250 return 0;
251 }
252 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
253
254 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
255 {
256 struct vfio_iommu_driver *driver;
257
258 mutex_lock(&vfio.iommu_drivers_lock);
259 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
260 if (driver->ops == ops) {
261 list_del(&driver->vfio_next);
262 mutex_unlock(&vfio.iommu_drivers_lock);
263 kfree(driver);
264 return;
265 }
266 }
267 mutex_unlock(&vfio.iommu_drivers_lock);
268 }
269 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
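
/*
 * A minimal registration sketch for an IOMMU backend, mirroring the
 * vfio_noiommu_ops table above.  The my_iommu_* callbacks are assumed to be
 * implemented elsewhere and are illustrative only:
 *
 *	static const struct vfio_iommu_driver_ops my_iommu_driver_ops = {
 *		.name		= "vfio-my-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= my_iommu_open,
 *		.release	= my_iommu_release,
 *		.ioctl		= my_iommu_ioctl,
 *		.attach_group	= my_iommu_attach_group,
 *		.detach_group	= my_iommu_detach_group,
 *	};
 *
 *	static int __init my_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&my_iommu_driver_ops);
 *	}
 *
 *	static void __exit my_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&my_iommu_driver_ops);
 *	}
 */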
270
271 /**
272 * Group minor allocation/free - both called with vfio.group_lock held
273 */
274 static int vfio_alloc_group_minor(struct vfio_group *group)
275 {
276 return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
277 }
278
279 static void vfio_free_group_minor(int minor)
280 {
281 idr_remove(&vfio.group_idr, minor);
282 }
283
284 static int vfio_iommu_group_notifier(struct notifier_block *nb,
285 unsigned long action, void *data);
286 static void vfio_group_get(struct vfio_group *group);
287
288 /**
289 * Container objects - containers are created when /dev/vfio/vfio is
290 * opened, but their lifecycle extends until the last user is done, so
291  * they're freed via kref. Must support container/group/device being
292 * closed in any order.
293 */
294 static void vfio_container_get(struct vfio_container *container)
295 {
296 kref_get(&container->kref);
297 }
298
299 static void vfio_container_release(struct kref *kref)
300 {
301 struct vfio_container *container;
302 container = container_of(kref, struct vfio_container, kref);
303
304 kfree(container);
305 }
306
307 static void vfio_container_put(struct vfio_container *container)
308 {
309 kref_put(&container->kref, vfio_container_release);
310 }
311
312 static void vfio_group_unlock_and_free(struct vfio_group *group)
313 {
314 mutex_unlock(&vfio.group_lock);
315 /*
316 * Unregister outside of lock. A spurious callback is harmless now
317 * that the group is no longer in vfio.group_list.
318 */
319 iommu_group_unregister_notifier(group->iommu_group, &group->nb);
320 kfree(group);
321 }
322
323 /**
324 * Group objects - create, release, get, put, search
325 */
326 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
327 {
328 struct vfio_group *group, *tmp;
329 struct device *dev;
330 int ret, minor;
331
332 group = kzalloc(sizeof(*group), GFP_KERNEL);
333 if (!group)
334 return ERR_PTR(-ENOMEM);
335
336 kref_init(&group->kref);
337 INIT_LIST_HEAD(&group->device_list);
338 mutex_init(&group->device_lock);
339 INIT_LIST_HEAD(&group->unbound_list);
340 mutex_init(&group->unbound_lock);
341 atomic_set(&group->container_users, 0);
342 atomic_set(&group->opened, 0);
343 init_waitqueue_head(&group->container_q);
344 group->iommu_group = iommu_group;
345 #ifdef CONFIG_VFIO_NOIOMMU
346 group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
347 #endif
348 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
349
350 group->nb.notifier_call = vfio_iommu_group_notifier;
351
352 /*
353 * blocking notifiers acquire a rwsem around registering and hold
354  * it around the callback. Therefore, we need to register outside of
355 * vfio.group_lock to avoid A-B/B-A contention. Our callback won't
356 * do anything unless it can find the group in vfio.group_list, so
357 * no harm in registering early.
358 */
359 ret = iommu_group_register_notifier(iommu_group, &group->nb);
360 if (ret) {
361 kfree(group);
362 return ERR_PTR(ret);
363 }
364
365 mutex_lock(&vfio.group_lock);
366
367 /* Did we race creating this group? */
368 list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
369 if (tmp->iommu_group == iommu_group) {
370 vfio_group_get(tmp);
371 vfio_group_unlock_and_free(group);
372 return tmp;
373 }
374 }
375
376 minor = vfio_alloc_group_minor(group);
377 if (minor < 0) {
378 vfio_group_unlock_and_free(group);
379 return ERR_PTR(minor);
380 }
381
382 dev = device_create(vfio.class, NULL,
383 MKDEV(MAJOR(vfio.group_devt), minor),
384 group, "%s%d", group->noiommu ? "noiommu-" : "",
385 iommu_group_id(iommu_group));
386 if (IS_ERR(dev)) {
387 vfio_free_group_minor(minor);
388 vfio_group_unlock_and_free(group);
389 return ERR_CAST(dev);
390 }
391
392 group->minor = minor;
393 group->dev = dev;
394
395 list_add(&group->vfio_next, &vfio.group_list);
396
397 mutex_unlock(&vfio.group_lock);
398
399 return group;
400 }
401
402 /* called with vfio.group_lock held */
403 static void vfio_group_release(struct kref *kref)
404 {
405 struct vfio_group *group = container_of(kref, struct vfio_group, kref);
406 struct vfio_unbound_dev *unbound, *tmp;
407 struct iommu_group *iommu_group = group->iommu_group;
408
409 WARN_ON(!list_empty(&group->device_list));
410 WARN_ON(group->notifier.head);
411
412 list_for_each_entry_safe(unbound, tmp,
413 &group->unbound_list, unbound_next) {
414 list_del(&unbound->unbound_next);
415 kfree(unbound);
416 }
417
418 device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
419 list_del(&group->vfio_next);
420 vfio_free_group_minor(group->minor);
421 vfio_group_unlock_and_free(group);
422 iommu_group_put(iommu_group);
423 }
424
425 static void vfio_group_put(struct vfio_group *group)
426 {
427 kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
428 }
429
430 struct vfio_group_put_work {
431 struct work_struct work;
432 struct vfio_group *group;
433 };
434
435 static void vfio_group_put_bg(struct work_struct *work)
436 {
437 struct vfio_group_put_work *do_work;
438
439 do_work = container_of(work, struct vfio_group_put_work, work);
440
441 vfio_group_put(do_work->group);
442 kfree(do_work);
443 }
444
445 static void vfio_group_schedule_put(struct vfio_group *group)
446 {
447 struct vfio_group_put_work *do_work;
448
449 do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
450 if (WARN_ON(!do_work))
451 return;
452
453 INIT_WORK(&do_work->work, vfio_group_put_bg);
454 do_work->group = group;
455 schedule_work(&do_work->work);
456 }
457
458 /* Assume group_lock or group reference is held */
459 static void vfio_group_get(struct vfio_group *group)
460 {
461 kref_get(&group->kref);
462 }
463
464 /*
465 * Not really a try as we will sleep for mutex, but we need to make
466 * sure the group pointer is valid under lock and get a reference.
467 */
468 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
469 {
470 struct vfio_group *target = group;
471
472 mutex_lock(&vfio.group_lock);
473 list_for_each_entry(group, &vfio.group_list, vfio_next) {
474 if (group == target) {
475 vfio_group_get(group);
476 mutex_unlock(&vfio.group_lock);
477 return group;
478 }
479 }
480 mutex_unlock(&vfio.group_lock);
481
482 return NULL;
483 }
484
485 static
486 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
487 {
488 struct vfio_group *group;
489
490 mutex_lock(&vfio.group_lock);
491 list_for_each_entry(group, &vfio.group_list, vfio_next) {
492 if (group->iommu_group == iommu_group) {
493 vfio_group_get(group);
494 mutex_unlock(&vfio.group_lock);
495 return group;
496 }
497 }
498 mutex_unlock(&vfio.group_lock);
499
500 return NULL;
501 }
502
503 static struct vfio_group *vfio_group_get_from_minor(int minor)
504 {
505 struct vfio_group *group;
506
507 mutex_lock(&vfio.group_lock);
508 group = idr_find(&vfio.group_idr, minor);
509 if (!group) {
510 mutex_unlock(&vfio.group_lock);
511 return NULL;
512 }
513 vfio_group_get(group);
514 mutex_unlock(&vfio.group_lock);
515
516 return group;
517 }
518
519 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
520 {
521 struct iommu_group *iommu_group;
522 struct vfio_group *group;
523
524 iommu_group = iommu_group_get(dev);
525 if (!iommu_group)
526 return NULL;
527
528 group = vfio_group_get_from_iommu(iommu_group);
529 iommu_group_put(iommu_group);
530
531 return group;
532 }
533
534 /**
535 * Device objects - create, release, get, put, search
536 */
537 static
538 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
539 struct device *dev,
540 const struct vfio_device_ops *ops,
541 void *device_data)
542 {
543 struct vfio_device *device;
544
545 device = kzalloc(sizeof(*device), GFP_KERNEL);
546 if (!device)
547 return ERR_PTR(-ENOMEM);
548
549 kref_init(&device->kref);
550 device->dev = dev;
551 device->group = group;
552 device->ops = ops;
553 device->device_data = device_data;
554 dev_set_drvdata(dev, device);
555
556 /* No need to get group_lock, caller has group reference */
557 vfio_group_get(group);
558
559 mutex_lock(&group->device_lock);
560 list_add(&device->group_next, &group->device_list);
561 mutex_unlock(&group->device_lock);
562
563 return device;
564 }
565
566 static void vfio_device_release(struct kref *kref)
567 {
568 struct vfio_device *device = container_of(kref,
569 struct vfio_device, kref);
570 struct vfio_group *group = device->group;
571
572 list_del(&device->group_next);
573 mutex_unlock(&group->device_lock);
574
575 dev_set_drvdata(device->dev, NULL);
576
577 kfree(device);
578
579 /* vfio_del_group_dev may be waiting for this device */
580 wake_up(&vfio.release_q);
581 }
582
583 /* Device reference always implies a group reference */
584 void vfio_device_put(struct vfio_device *device)
585 {
586 struct vfio_group *group = device->group;
587 kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
588 vfio_group_put(group);
589 }
590 EXPORT_SYMBOL_GPL(vfio_device_put);
591
592 static void vfio_device_get(struct vfio_device *device)
593 {
594 vfio_group_get(device->group);
595 kref_get(&device->kref);
596 }
597
598 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
599 struct device *dev)
600 {
601 struct vfio_device *device;
602
603 mutex_lock(&group->device_lock);
604 list_for_each_entry(device, &group->device_list, group_next) {
605 if (device->dev == dev) {
606 vfio_device_get(device);
607 mutex_unlock(&group->device_lock);
608 return device;
609 }
610 }
611 mutex_unlock(&group->device_lock);
612 return NULL;
613 }
614
615 /*
616 * Some drivers, like pci-stub, are only used to prevent other drivers from
617 * claiming a device and are therefore perfectly legitimate for a user owned
618 * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping
619 * of the device, but it does prevent the user from having direct access to
620 * the device, which is useful in some circumstances.
621 *
622 * We also assume that we can include PCI interconnect devices, ie. bridges.
623 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
624 * then all of the downstream devices will be part of the same IOMMU group as
625 * the bridge. Thus, if placing the bridge into the user owned IOVA space
626 * breaks anything, it only does so for user owned devices downstream. Note
627 * that error notification via MSI can be affected for platforms that handle
628 * MSI within the same IOVA space as DMA.
629 */
630 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
631
632 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
633 {
634 int i;
635
636 if (dev_is_pci(dev)) {
637 struct pci_dev *pdev = to_pci_dev(dev);
638
639 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
640 return true;
641 }
642
643 for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
644 if (!strcmp(drv->name, vfio_driver_whitelist[i]))
645 return true;
646 }
647
648 return false;
649 }
650
651 /*
652 * A vfio group is viable for use by userspace if all devices are in
653 * one of the following states:
654 * - driver-less
655 * - bound to a vfio driver
656 * - bound to a whitelisted driver
657 * - a PCI interconnect device
658 *
659 * We use two methods to determine whether a device is bound to a vfio
660 * driver. The first is to test whether the device exists in the vfio
661 * group. The second is to test if the device exists on the group
662 * unbound_list, indicating it's in the middle of transitioning from
663 * a vfio driver to driver-less.
664 */
665 static int vfio_dev_viable(struct device *dev, void *data)
666 {
667 struct vfio_group *group = data;
668 struct vfio_device *device;
669 struct device_driver *drv = ACCESS_ONCE(dev->driver);
670 struct vfio_unbound_dev *unbound;
671 int ret = -EINVAL;
672
673 mutex_lock(&group->unbound_lock);
674 list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
675 if (dev == unbound->dev) {
676 ret = 0;
677 break;
678 }
679 }
680 mutex_unlock(&group->unbound_lock);
681
682 if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
683 return 0;
684
685 device = vfio_group_get_device(group, dev);
686 if (device) {
687 vfio_device_put(device);
688 return 0;
689 }
690
691 return ret;
692 }
693
694 /**
695 * Async device support
696 */
697 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
698 {
699 struct vfio_device *device;
700
701 /* Do we already know about it? We shouldn't */
702 device = vfio_group_get_device(group, dev);
703 if (WARN_ON_ONCE(device)) {
704 vfio_device_put(device);
705 return 0;
706 }
707
708 /* Nothing to do for idle groups */
709 if (!atomic_read(&group->container_users))
710 return 0;
711
712 /* TODO Prevent device auto probing */
713 WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
714 iommu_group_id(group->iommu_group));
715
716 return 0;
717 }
718
719 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
720 {
721 /* We don't care what happens when the group isn't in use */
722 if (!atomic_read(&group->container_users))
723 return 0;
724
725 return vfio_dev_viable(dev, group);
726 }
727
728 static int vfio_iommu_group_notifier(struct notifier_block *nb,
729 unsigned long action, void *data)
730 {
731 struct vfio_group *group = container_of(nb, struct vfio_group, nb);
732 struct device *dev = data;
733 struct vfio_unbound_dev *unbound;
734
735 /*
736 * Need to go through a group_lock lookup to get a reference or we
737 * risk racing a group being removed. Ignore spurious notifies.
738 */
739 group = vfio_group_try_get(group);
740 if (!group)
741 return NOTIFY_OK;
742
743 switch (action) {
744 case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
745 vfio_group_nb_add_dev(group, dev);
746 break;
747 case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
748 /*
749 * Nothing to do here. If the device is in use, then the
750 * vfio sub-driver should block the remove callback until
751 * it is unused. If the device is unused or attached to a
752 * stub driver, then it should be released and we don't
753 * care that it will be going away.
754 */
755 break;
756 case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
757 pr_debug("%s: Device %s, group %d binding to driver\n",
758 __func__, dev_name(dev),
759 iommu_group_id(group->iommu_group));
760 break;
761 case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
762 pr_debug("%s: Device %s, group %d bound to driver %s\n",
763 __func__, dev_name(dev),
764 iommu_group_id(group->iommu_group), dev->driver->name);
765 BUG_ON(vfio_group_nb_verify(group, dev));
766 break;
767 case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
768 pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
769 __func__, dev_name(dev),
770 iommu_group_id(group->iommu_group), dev->driver->name);
771 break;
772 case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
773 pr_debug("%s: Device %s, group %d unbound from driver\n",
774 __func__, dev_name(dev),
775 iommu_group_id(group->iommu_group));
776 /*
777 * XXX An unbound device in a live group is ok, but we'd
778 * really like to avoid the above BUG_ON by preventing other
779 * drivers from binding to it. Once that occurs, we have to
780 * stop the system to maintain isolation. At a minimum, we'd
781 * want a toggle to disable driver auto probe for this device.
782 */
783
784 mutex_lock(&group->unbound_lock);
785 list_for_each_entry(unbound,
786 &group->unbound_list, unbound_next) {
787 if (dev == unbound->dev) {
788 list_del(&unbound->unbound_next);
789 kfree(unbound);
790 break;
791 }
792 }
793 mutex_unlock(&group->unbound_lock);
794 break;
795 }
796
797 /*
798 * If we're the last reference to the group, the group will be
799 * released, which includes unregistering the iommu group notifier.
800 * We hold a read-lock on that notifier list, unregistering needs
801 * a write-lock... deadlock. Release our reference asynchronously
802 * to avoid that situation.
803 */
804 vfio_group_schedule_put(group);
805 return NOTIFY_OK;
806 }
807
808 /**
809 * VFIO driver API
810 */
811 int vfio_add_group_dev(struct device *dev,
812 const struct vfio_device_ops *ops, void *device_data)
813 {
814 struct iommu_group *iommu_group;
815 struct vfio_group *group;
816 struct vfio_device *device;
817
818 iommu_group = iommu_group_get(dev);
819 if (!iommu_group)
820 return -EINVAL;
821
822 group = vfio_group_get_from_iommu(iommu_group);
823 if (!group) {
824 group = vfio_create_group(iommu_group);
825 if (IS_ERR(group)) {
826 iommu_group_put(iommu_group);
827 return PTR_ERR(group);
828 }
829 } else {
830 /*
831 * A found vfio_group already holds a reference to the
832 * iommu_group. A created vfio_group keeps the reference.
833 */
834 iommu_group_put(iommu_group);
835 }
836
837 device = vfio_group_get_device(group, dev);
838 if (device) {
839 WARN(1, "Device %s already exists on group %d\n",
840 dev_name(dev), iommu_group_id(iommu_group));
841 vfio_device_put(device);
842 vfio_group_put(group);
843 return -EBUSY;
844 }
845
846 device = vfio_group_create_device(group, dev, ops, device_data);
847 if (IS_ERR(device)) {
848 vfio_group_put(group);
849 return PTR_ERR(device);
850 }
851
852 /*
853 * Drop all but the vfio_device reference. The vfio_device holds
854 * a reference to the vfio_group, which holds a reference to the
855 * iommu_group.
856 */
857 vfio_group_put(group);
858
859 return 0;
860 }
861 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
862
863 /**
864 * Get a reference to the vfio_device for a device. Even if the
865 * caller thinks they own the device, they could be racing with a
866 * release call path, so we can't trust drvdata for the shortcut.
867 * Go the long way around, from the iommu_group to the vfio_group
868 * to the vfio_device.
869 */
870 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
871 {
872 struct vfio_group *group;
873 struct vfio_device *device;
874
875 group = vfio_group_get_from_dev(dev);
876 if (!group)
877 return NULL;
878
879 device = vfio_group_get_device(group, dev);
880 vfio_group_put(group);
881
882 return device;
883 }
884 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
885
886 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
887 char *buf)
888 {
889 struct vfio_device *it, *device = NULL;
890
891 mutex_lock(&group->device_lock);
892 list_for_each_entry(it, &group->device_list, group_next) {
893 if (!strcmp(dev_name(it->dev), buf)) {
894 device = it;
895 vfio_device_get(device);
896 break;
897 }
898 }
899 mutex_unlock(&group->device_lock);
900
901 return device;
902 }
903
904 /*
905 * Caller must hold a reference to the vfio_device
906 */
907 void *vfio_device_data(struct vfio_device *device)
908 {
909 return device->device_data;
910 }
911 EXPORT_SYMBOL_GPL(vfio_device_data);
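
/*
 * A short sketch of how a caller holding only a struct device might look up
 * its vfio private data through the pair above (my_private is an assumed
 * type for illustration):
 *
 *	struct vfio_device *vdev = vfio_device_get_from_dev(dev);
 *	struct my_private *priv;
 *
 *	if (!vdev)
 *		return -ENODEV;
 *	priv = vfio_device_data(vdev);
 *	// ... use priv while the device reference is held ...
 *	vfio_device_put(vdev);
 */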
912
913 /*
914 * Decrement the device reference count and wait for the device to be
915 * removed. Open file descriptors for the device... */
916 void *vfio_del_group_dev(struct device *dev)
917 {
918 DEFINE_WAIT_FUNC(wait, woken_wake_function);
919 struct vfio_device *device = dev_get_drvdata(dev);
920 struct vfio_group *group = device->group;
921 void *device_data = device->device_data;
922 struct vfio_unbound_dev *unbound;
923 unsigned int i = 0;
924 bool interrupted = false;
925
926 /*
927 * The group exists so long as we have a device reference. Get
928 * a group reference and use it to scan for the device going away.
929 */
930 vfio_group_get(group);
931
932 /*
933 * When the device is removed from the group, the group suddenly
934 * becomes non-viable; the device has a driver (until the unbind
935 * completes), but it's not present in the group. This is bad news
936 * for any external users that need to re-acquire a group reference
937 * in order to match and release their existing reference. To
938 * solve this, we track such devices on the unbound_list to bridge
939 * the gap until they're fully unbound.
940 */
941 unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
942 if (unbound) {
943 unbound->dev = dev;
944 mutex_lock(&group->unbound_lock);
945 list_add(&unbound->unbound_next, &group->unbound_list);
946 mutex_unlock(&group->unbound_lock);
947 }
948 WARN_ON(!unbound);
949
950 vfio_device_put(device);
951
952 /*
953 * If the device is still present in the group after the above
954 * 'put', then it is in use and we need to request it from the
955 * bus driver. The driver may in turn need to request the
956 * device from the user. We send the request on an arbitrary
957  * interval with a counter to allow the driver to take escalating
958 * measures to release the device if it has the ability to do so.
959 */
960 add_wait_queue(&vfio.release_q, &wait);
961
962 do {
963 device = vfio_group_get_device(group, dev);
964 if (!device)
965 break;
966
967 if (device->ops->request)
968 device->ops->request(device_data, i++);
969
970 vfio_device_put(device);
971
972 if (interrupted) {
973 wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
974 } else {
975 wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
976 if (signal_pending(current)) {
977 interrupted = true;
978 dev_warn(dev,
979 "Device is currently in use, task"
980 " \"%s\" (%d) "
981 "blocked until device is released",
982 current->comm, task_pid_nr(current));
983 }
984 }
985
986 } while (1);
987
988 remove_wait_queue(&vfio.release_q, &wait);
989 /*
990 * In order to support multiple devices per group, devices can be
991 * plucked from the group while other devices in the group are still
992 * in use. The container persists with this group and those remaining
993 * devices still attached. If the user creates an isolation violation
994 * by binding this device to another driver while the group is still in
995 * use, that's their fault. However, in the case of removing the last,
996 * or potentially the only, device in the group there can be no other
997 * in-use devices in the group. The user has done their due diligence
998 * and we should lay no claims to those devices. In order to do that,
999 * we need to make sure the group is detached from the container.
1000 * Without this stall, we're potentially racing with a user process
1001 * that may attempt to immediately bind this device to another driver.
1002 */
1003 if (list_empty(&group->device_list))
1004 wait_event(group->container_q, !group->container);
1005
1006 vfio_group_put(group);
1007
1008 return device_data;
1009 }
1010 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
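
/*
 * The request loop above relies on the bus driver's optional ->request()
 * callback to nudge the user into releasing the device.  A sketch of such a
 * callback, loosely modeled on relaying the request through an eventfd that
 * userspace registered earlier (my_vfio_device and its req_trigger/dev
 * fields are assumptions of the example):
 *
 *	static void my_vfio_request(void *device_data, unsigned int count)
 *	{
 *		struct my_vfio_device *vdev = device_data;
 *
 *		if (vdev->req_trigger) {
 *			dev_notice_ratelimited(vdev->dev,
 *				"Relaying device request to user (#%u)\n", count);
 *			eventfd_signal(vdev->req_trigger, 1);
 *		} else if (count == 0) {
 *			dev_warn(vdev->dev,
 *				"No device request channel registered, blocked until released by user\n");
 *		}
 *	}
 */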
1011
1012 /**
1013 * VFIO base fd, /dev/vfio/vfio
1014 */
1015 static long vfio_ioctl_check_extension(struct vfio_container *container,
1016 unsigned long arg)
1017 {
1018 struct vfio_iommu_driver *driver;
1019 long ret = 0;
1020
1021 down_read(&container->group_lock);
1022
1023 driver = container->iommu_driver;
1024
1025 switch (arg) {
1026 /* No base extensions yet */
1027 default:
1028 /*
1029 * If no driver is set, poll all registered drivers for
1030 * extensions and return the first positive result. If
1031 * a driver is already set, further queries will be passed
1032 * only to that driver.
1033 */
1034 if (!driver) {
1035 mutex_lock(&vfio.iommu_drivers_lock);
1036 list_for_each_entry(driver, &vfio.iommu_drivers_list,
1037 vfio_next) {
1038
1039 #ifdef CONFIG_VFIO_NOIOMMU
1040 if (!list_empty(&container->group_list) &&
1041 (container->noiommu !=
1042 (driver->ops == &vfio_noiommu_ops)))
1043 continue;
1044 #endif
1045
1046 if (!try_module_get(driver->ops->owner))
1047 continue;
1048
1049 ret = driver->ops->ioctl(NULL,
1050 VFIO_CHECK_EXTENSION,
1051 arg);
1052 module_put(driver->ops->owner);
1053 if (ret > 0)
1054 break;
1055 }
1056 mutex_unlock(&vfio.iommu_drivers_lock);
1057 } else
1058 ret = driver->ops->ioctl(container->iommu_data,
1059 VFIO_CHECK_EXTENSION, arg);
1060 }
1061
1062 up_read(&container->group_lock);
1063
1064 return ret;
1065 }
1066
1067 /* hold write lock on container->group_lock */
1068 static int __vfio_container_attach_groups(struct vfio_container *container,
1069 struct vfio_iommu_driver *driver,
1070 void *data)
1071 {
1072 struct vfio_group *group;
1073 int ret = -ENODEV;
1074
1075 list_for_each_entry(group, &container->group_list, container_next) {
1076 ret = driver->ops->attach_group(data, group->iommu_group);
1077 if (ret)
1078 goto unwind;
1079 }
1080
1081 return ret;
1082
1083 unwind:
1084 list_for_each_entry_continue_reverse(group, &container->group_list,
1085 container_next) {
1086 driver->ops->detach_group(data, group->iommu_group);
1087 }
1088
1089 return ret;
1090 }
1091
1092 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1093 unsigned long arg)
1094 {
1095 struct vfio_iommu_driver *driver;
1096 long ret = -ENODEV;
1097
1098 down_write(&container->group_lock);
1099
1100 /*
1101 * The container is designed to be an unprivileged interface while
1102 * the group can be assigned to specific users. Therefore, only by
1103 * adding a group to a container does the user get the privilege of
1104 * enabling the iommu, which may allocate finite resources. There
1105 * is no unset_iommu, but by removing all the groups from a container,
1106 * the container is deprivileged and returns to an unset state.
1107 */
1108 if (list_empty(&container->group_list) || container->iommu_driver) {
1109 up_write(&container->group_lock);
1110 return -EINVAL;
1111 }
1112
1113 mutex_lock(&vfio.iommu_drivers_lock);
1114 list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1115 void *data;
1116
1117 #ifdef CONFIG_VFIO_NOIOMMU
1118 /*
1119 * Only noiommu containers can use vfio-noiommu and noiommu
1120 * containers can only use vfio-noiommu.
1121 */
1122 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1123 continue;
1124 #endif
1125
1126 if (!try_module_get(driver->ops->owner))
1127 continue;
1128
1129 /*
1130 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1131 * so test which iommu driver reported support for this
1132 * extension and call open on them. We also pass them the
1133 * magic, allowing a single driver to support multiple
1134 * interfaces if they'd like.
1135 */
1136 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1137 module_put(driver->ops->owner);
1138 continue;
1139 }
1140
1141 data = driver->ops->open(arg);
1142 if (IS_ERR(data)) {
1143 ret = PTR_ERR(data);
1144 module_put(driver->ops->owner);
1145 continue;
1146 }
1147
1148 ret = __vfio_container_attach_groups(container, driver, data);
1149 if (ret) {
1150 driver->ops->release(data);
1151 module_put(driver->ops->owner);
1152 continue;
1153 }
1154
1155 container->iommu_driver = driver;
1156 container->iommu_data = data;
1157 break;
1158 }
1159
1160 mutex_unlock(&vfio.iommu_drivers_lock);
1161 up_write(&container->group_lock);
1162
1163 return ret;
1164 }
1165
1166 static long vfio_fops_unl_ioctl(struct file *filep,
1167 unsigned int cmd, unsigned long arg)
1168 {
1169 struct vfio_container *container = filep->private_data;
1170 struct vfio_iommu_driver *driver;
1171 void *data;
1172 long ret = -EINVAL;
1173
1174 if (!container)
1175 return ret;
1176
1177 switch (cmd) {
1178 case VFIO_GET_API_VERSION:
1179 ret = VFIO_API_VERSION;
1180 break;
1181 case VFIO_CHECK_EXTENSION:
1182 ret = vfio_ioctl_check_extension(container, arg);
1183 break;
1184 case VFIO_SET_IOMMU:
1185 ret = vfio_ioctl_set_iommu(container, arg);
1186 break;
1187 default:
1188 driver = container->iommu_driver;
1189 data = container->iommu_data;
1190
1191 if (driver) /* passthrough all unrecognized ioctls */
1192 ret = driver->ops->ioctl(data, cmd, arg);
1193 }
1194
1195 return ret;
1196 }
1197
1198 #ifdef CONFIG_COMPAT
1199 static long vfio_fops_compat_ioctl(struct file *filep,
1200 unsigned int cmd, unsigned long arg)
1201 {
1202 arg = (unsigned long)compat_ptr(arg);
1203 return vfio_fops_unl_ioctl(filep, cmd, arg);
1204 }
1205 #endif /* CONFIG_COMPAT */
1206
1207 static int vfio_fops_open(struct inode *inode, struct file *filep)
1208 {
1209 struct vfio_container *container;
1210
1211 container = kzalloc(sizeof(*container), GFP_KERNEL);
1212 if (!container)
1213 return -ENOMEM;
1214
1215 INIT_LIST_HEAD(&container->group_list);
1216 init_rwsem(&container->group_lock);
1217 kref_init(&container->kref);
1218
1219 filep->private_data = container;
1220
1221 return 0;
1222 }
1223
1224 static int vfio_fops_release(struct inode *inode, struct file *filep)
1225 {
1226 struct vfio_container *container = filep->private_data;
1227
1228 filep->private_data = NULL;
1229
1230 vfio_container_put(container);
1231
1232 return 0;
1233 }
1234
1235 /*
1236 * Once an iommu driver is set, we optionally pass read/write/mmap
1237 * on to the driver, allowing management interfaces beyond ioctl.
1238 */
1239 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1240 size_t count, loff_t *ppos)
1241 {
1242 struct vfio_container *container = filep->private_data;
1243 struct vfio_iommu_driver *driver;
1244 ssize_t ret = -EINVAL;
1245
1246 driver = container->iommu_driver;
1247 if (likely(driver && driver->ops->read))
1248 ret = driver->ops->read(container->iommu_data,
1249 buf, count, ppos);
1250
1251 return ret;
1252 }
1253
1254 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1255 size_t count, loff_t *ppos)
1256 {
1257 struct vfio_container *container = filep->private_data;
1258 struct vfio_iommu_driver *driver;
1259 ssize_t ret = -EINVAL;
1260
1261 driver = container->iommu_driver;
1262 if (likely(driver && driver->ops->write))
1263 ret = driver->ops->write(container->iommu_data,
1264 buf, count, ppos);
1265
1266 return ret;
1267 }
1268
1269 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1270 {
1271 struct vfio_container *container = filep->private_data;
1272 struct vfio_iommu_driver *driver;
1273 int ret = -EINVAL;
1274
1275 driver = container->iommu_driver;
1276 if (likely(driver && driver->ops->mmap))
1277 ret = driver->ops->mmap(container->iommu_data, vma);
1278
1279 return ret;
1280 }
1281
1282 static const struct file_operations vfio_fops = {
1283 .owner = THIS_MODULE,
1284 .open = vfio_fops_open,
1285 .release = vfio_fops_release,
1286 .read = vfio_fops_read,
1287 .write = vfio_fops_write,
1288 .unlocked_ioctl = vfio_fops_unl_ioctl,
1289 #ifdef CONFIG_COMPAT
1290 .compat_ioctl = vfio_fops_compat_ioctl,
1291 #endif
1292 .mmap = vfio_fops_mmap,
1293 };
1294
1295 /**
1296 * VFIO Group fd, /dev/vfio/$GROUP
1297 */
1298 static void __vfio_group_unset_container(struct vfio_group *group)
1299 {
1300 struct vfio_container *container = group->container;
1301 struct vfio_iommu_driver *driver;
1302
1303 down_write(&container->group_lock);
1304
1305 driver = container->iommu_driver;
1306 if (driver)
1307 driver->ops->detach_group(container->iommu_data,
1308 group->iommu_group);
1309
1310 group->container = NULL;
1311 wake_up(&group->container_q);
1312 list_del(&group->container_next);
1313
1314 /* Detaching the last group deprivileges a container, remove iommu */
1315 if (driver && list_empty(&container->group_list)) {
1316 driver->ops->release(container->iommu_data);
1317 module_put(driver->ops->owner);
1318 container->iommu_driver = NULL;
1319 container->iommu_data = NULL;
1320 }
1321
1322 up_write(&container->group_lock);
1323
1324 vfio_container_put(container);
1325 }
1326
1327 /*
1328 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1329 * if there was no container to unset. Since the ioctl is called on
1330  * the group, we know the group still exists; therefore the only valid
1331 * transition here is 1->0.
1332 */
1333 static int vfio_group_unset_container(struct vfio_group *group)
1334 {
1335 int users = atomic_cmpxchg(&group->container_users, 1, 0);
1336
1337 if (!users)
1338 return -EINVAL;
1339 if (users != 1)
1340 return -EBUSY;
1341
1342 __vfio_group_unset_container(group);
1343
1344 return 0;
1345 }
1346
1347 /*
1348 * When removing container users, anything that removes the last user
1349 * implicitly removes the group from the container. That is, if the
1350 * group file descriptor is closed, as well as any device file descriptors,
1351 * the group is free.
1352 */
1353 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1354 {
1355 if (0 == atomic_dec_if_positive(&group->container_users))
1356 __vfio_group_unset_container(group);
1357 }
1358
1359 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1360 {
1361 struct fd f;
1362 struct vfio_container *container;
1363 struct vfio_iommu_driver *driver;
1364 int ret = 0;
1365
1366 if (atomic_read(&group->container_users))
1367 return -EINVAL;
1368
1369 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1370 return -EPERM;
1371
1372 f = fdget(container_fd);
1373 if (!f.file)
1374 return -EBADF;
1375
1376 /* Sanity check, is this really our fd? */
1377 if (f.file->f_op != &vfio_fops) {
1378 fdput(f);
1379 return -EINVAL;
1380 }
1381
1382 container = f.file->private_data;
1383 WARN_ON(!container); /* fget ensures we don't race vfio_release */
1384
1385 down_write(&container->group_lock);
1386
1387 /* Real groups and fake groups cannot mix */
1388 if (!list_empty(&container->group_list) &&
1389 container->noiommu != group->noiommu) {
1390 ret = -EPERM;
1391 goto unlock_out;
1392 }
1393
1394 driver = container->iommu_driver;
1395 if (driver) {
1396 ret = driver->ops->attach_group(container->iommu_data,
1397 group->iommu_group);
1398 if (ret)
1399 goto unlock_out;
1400 }
1401
1402 group->container = container;
1403 container->noiommu = group->noiommu;
1404 list_add(&group->container_next, &container->group_list);
1405
1406 /* Get a reference on the container and mark a user within the group */
1407 vfio_container_get(container);
1408 atomic_inc(&group->container_users);
1409
1410 unlock_out:
1411 up_write(&container->group_lock);
1412 fdput(f);
1413 return ret;
1414 }
1415
1416 static bool vfio_group_viable(struct vfio_group *group)
1417 {
1418 return (iommu_group_for_each_dev(group->iommu_group,
1419 group, vfio_dev_viable) == 0);
1420 }
1421
1422 static int vfio_group_add_container_user(struct vfio_group *group)
1423 {
1424 if (!atomic_inc_not_zero(&group->container_users))
1425 return -EINVAL;
1426
1427 if (group->noiommu) {
1428 atomic_dec(&group->container_users);
1429 return -EPERM;
1430 }
1431 if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1432 atomic_dec(&group->container_users);
1433 return -EINVAL;
1434 }
1435
1436 return 0;
1437 }
1438
1439 static const struct file_operations vfio_device_fops;
1440
1441 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1442 {
1443 struct vfio_device *device;
1444 struct file *filep;
1445 int ret;
1446
1447 if (0 == atomic_read(&group->container_users) ||
1448 !group->container->iommu_driver || !vfio_group_viable(group))
1449 return -EINVAL;
1450
1451 if (group->noiommu && !capable(CAP_SYS_RAWIO))
1452 return -EPERM;
1453
1454 device = vfio_device_get_from_name(group, buf);
1455 if (!device)
1456 return -ENODEV;
1457
1458 ret = device->ops->open(device->device_data);
1459 if (ret) {
1460 vfio_device_put(device);
1461 return ret;
1462 }
1463
1464 /*
1465 * We can't use anon_inode_getfd() because we need to modify
1466 * the f_mode flags directly to allow more than just ioctls
1467 */
1468 ret = get_unused_fd_flags(O_CLOEXEC);
1469 if (ret < 0) {
1470 device->ops->release(device->device_data);
1471 vfio_device_put(device);
1472 return ret;
1473 }
1474
1475 filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1476 device, O_RDWR);
1477 if (IS_ERR(filep)) {
1478 put_unused_fd(ret);
1479 ret = PTR_ERR(filep);
1480 device->ops->release(device->device_data);
1481 vfio_device_put(device);
1482 return ret;
1483 }
1484
1485 /*
1486 * TODO: add an anon_inode interface to do this.
1487 * Appears to be missing by lack of need rather than
1488 * explicitly prevented. Now there's need.
1489 */
1490 filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1491
1492 atomic_inc(&group->container_users);
1493
1494 fd_install(ret, filep);
1495
1496 if (group->noiommu)
1497 dev_warn(device->dev, "vfio-noiommu device opened by user "
1498 "(%s:%d)\n", current->comm, task_pid_nr(current));
1499
1500 return ret;
1501 }
1502
1503 static long vfio_group_fops_unl_ioctl(struct file *filep,
1504 unsigned int cmd, unsigned long arg)
1505 {
1506 struct vfio_group *group = filep->private_data;
1507 long ret = -ENOTTY;
1508
1509 switch (cmd) {
1510 case VFIO_GROUP_GET_STATUS:
1511 {
1512 struct vfio_group_status status;
1513 unsigned long minsz;
1514
1515 minsz = offsetofend(struct vfio_group_status, flags);
1516
1517 if (copy_from_user(&status, (void __user *)arg, minsz))
1518 return -EFAULT;
1519
1520 if (status.argsz < minsz)
1521 return -EINVAL;
1522
1523 status.flags = 0;
1524
1525 if (vfio_group_viable(group))
1526 status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1527
1528 if (group->container)
1529 status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1530
1531 if (copy_to_user((void __user *)arg, &status, minsz))
1532 return -EFAULT;
1533
1534 ret = 0;
1535 break;
1536 }
1537 case VFIO_GROUP_SET_CONTAINER:
1538 {
1539 int fd;
1540
1541 if (get_user(fd, (int __user *)arg))
1542 return -EFAULT;
1543
1544 if (fd < 0)
1545 return -EINVAL;
1546
1547 ret = vfio_group_set_container(group, fd);
1548 break;
1549 }
1550 case VFIO_GROUP_UNSET_CONTAINER:
1551 ret = vfio_group_unset_container(group);
1552 break;
1553 case VFIO_GROUP_GET_DEVICE_FD:
1554 {
1555 char *buf;
1556
1557 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1558 if (IS_ERR(buf))
1559 return PTR_ERR(buf);
1560
1561 ret = vfio_group_get_device_fd(group, buf);
1562 kfree(buf);
1563 break;
1564 }
1565 }
1566
1567 return ret;
1568 }
1569
1570 #ifdef CONFIG_COMPAT
1571 static long vfio_group_fops_compat_ioctl(struct file *filep,
1572 unsigned int cmd, unsigned long arg)
1573 {
1574 arg = (unsigned long)compat_ptr(arg);
1575 return vfio_group_fops_unl_ioctl(filep, cmd, arg);
1576 }
1577 #endif /* CONFIG_COMPAT */
1578
1579 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1580 {
1581 struct vfio_group *group;
1582 int opened;
1583
1584 group = vfio_group_get_from_minor(iminor(inode));
1585 if (!group)
1586 return -ENODEV;
1587
1588 if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1589 vfio_group_put(group);
1590 return -EPERM;
1591 }
1592
1593 /* Do we need multiple instances of the group open? Seems not. */
1594 opened = atomic_cmpxchg(&group->opened, 0, 1);
1595 if (opened) {
1596 vfio_group_put(group);
1597 return -EBUSY;
1598 }
1599
1600 /* Is something still in use from a previous open? */
1601 if (group->container) {
1602 atomic_dec(&group->opened);
1603 vfio_group_put(group);
1604 return -EBUSY;
1605 }
1606
1607 /* Warn if previous user didn't cleanup and re-init to drop them */
1608 if (WARN_ON(group->notifier.head))
1609 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1610
1611 filep->private_data = group;
1612
1613 return 0;
1614 }
1615
1616 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1617 {
1618 struct vfio_group *group = filep->private_data;
1619
1620 filep->private_data = NULL;
1621
1622 vfio_group_try_dissolve_container(group);
1623
1624 atomic_dec(&group->opened);
1625
1626 vfio_group_put(group);
1627
1628 return 0;
1629 }
1630
1631 static const struct file_operations vfio_group_fops = {
1632 .owner = THIS_MODULE,
1633 .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1634 #ifdef CONFIG_COMPAT
1635 .compat_ioctl = vfio_group_fops_compat_ioctl,
1636 #endif
1637 .open = vfio_group_fops_open,
1638 .release = vfio_group_fops_release,
1639 };
1640
1641 /**
1642 * VFIO Device fd
1643 */
1644 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1645 {
1646 struct vfio_device *device = filep->private_data;
1647
1648 device->ops->release(device->device_data);
1649
1650 vfio_group_try_dissolve_container(device->group);
1651
1652 vfio_device_put(device);
1653
1654 return 0;
1655 }
1656
1657 static long vfio_device_fops_unl_ioctl(struct file *filep,
1658 unsigned int cmd, unsigned long arg)
1659 {
1660 struct vfio_device *device = filep->private_data;
1661
1662 if (unlikely(!device->ops->ioctl))
1663 return -EINVAL;
1664
1665 return device->ops->ioctl(device->device_data, cmd, arg);
1666 }
1667
1668 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1669 size_t count, loff_t *ppos)
1670 {
1671 struct vfio_device *device = filep->private_data;
1672
1673 if (unlikely(!device->ops->read))
1674 return -EINVAL;
1675
1676 return device->ops->read(device->device_data, buf, count, ppos);
1677 }
1678
1679 static ssize_t vfio_device_fops_write(struct file *filep,
1680 const char __user *buf,
1681 size_t count, loff_t *ppos)
1682 {
1683 struct vfio_device *device = filep->private_data;
1684
1685 if (unlikely(!device->ops->write))
1686 return -EINVAL;
1687
1688 return device->ops->write(device->device_data, buf, count, ppos);
1689 }
1690
1691 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1692 {
1693 struct vfio_device *device = filep->private_data;
1694
1695 if (unlikely(!device->ops->mmap))
1696 return -EINVAL;
1697
1698 return device->ops->mmap(device->device_data, vma);
1699 }
1700
1701 #ifdef CONFIG_COMPAT
1702 static long vfio_device_fops_compat_ioctl(struct file *filep,
1703 unsigned int cmd, unsigned long arg)
1704 {
1705 arg = (unsigned long)compat_ptr(arg);
1706 return vfio_device_fops_unl_ioctl(filep, cmd, arg);
1707 }
1708 #endif /* CONFIG_COMPAT */
1709
1710 static const struct file_operations vfio_device_fops = {
1711 .owner = THIS_MODULE,
1712 .release = vfio_device_fops_release,
1713 .read = vfio_device_fops_read,
1714 .write = vfio_device_fops_write,
1715 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1716 #ifdef CONFIG_COMPAT
1717 .compat_ioctl = vfio_device_fops_compat_ioctl,
1718 #endif
1719 .mmap = vfio_device_fops_mmap,
1720 };
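
/*
 * A condensed userspace sketch of the container/group/device handshake that
 * the three sets of fops above implement (the group number, device name and
 * type1 IOMMU choice are illustrative; error handling is elided):
 *
 *	int container, group, device;
 *	struct vfio_group_status status = { .argsz = sizeof(status) };
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;		// unknown API version
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;		// no type1 support
 *
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
 *	if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE))
 *		return -1;		// not all devices bound to vfio/stub
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 */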
1721
1722 /**
1723 * External user API, exported by symbols to be linked dynamically.
1724 *
1725 * The protocol includes:
1726 * 1. do normal VFIO init operation:
1727 * - opening a new container;
1728 * - attaching group(s) to it;
1729 * - setting an IOMMU driver for a container.
1730 * When IOMMU is set for a container, all groups in it are
1731 * considered ready to use by an external user.
1732 *
1733 * 2. User space passes a group fd to an external user.
1734 * The external user calls vfio_group_get_external_user()
1735 * to verify that:
1736 * - the group is initialized;
1737 * - IOMMU is set for it.
1738 * If both checks passed, vfio_group_get_external_user()
1739 * increments the container user counter to prevent
1740 * the VFIO group from disposal before KVM exits.
1741 *
1742 * 3. The external user calls vfio_external_user_iommu_id()
1743 * to know an IOMMU ID.
1744 *
1745 * 4. When the external KVM finishes, it calls
1746 * vfio_group_put_external_user() to release the VFIO group.
1747 * This call decrements the container user counter.
1748 */
1749 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1750 {
1751 struct vfio_group *group = filep->private_data;
1752 int ret;
1753
1754 if (filep->f_op != &vfio_group_fops)
1755 return ERR_PTR(-EINVAL);
1756
1757 ret = vfio_group_add_container_user(group);
1758 if (ret)
1759 return ERR_PTR(ret);
1760
1761 vfio_group_get(group);
1762
1763 return group;
1764 }
1765 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1766
1767 void vfio_group_put_external_user(struct vfio_group *group)
1768 {
1769 vfio_group_try_dissolve_container(group);
1770 vfio_group_put(group);
1771 }
1772 EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1773
1774 bool vfio_external_group_match_file(struct vfio_group *test_group,
1775 struct file *filep)
1776 {
1777 struct vfio_group *group = filep->private_data;
1778
1779 return (filep->f_op == &vfio_group_fops) && (group == test_group);
1780 }
1781 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1782
1783 int vfio_external_user_iommu_id(struct vfio_group *group)
1784 {
1785 return iommu_group_id(group->iommu_group);
1786 }
1787 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1788
1789 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1790 {
1791 return vfio_ioctl_check_extension(group->container, arg);
1792 }
1793 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
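
/*
 * A minimal sketch of an in-kernel consumer (KVM-like) resolving a group file
 * descriptor handed over by userspace; my_consume_group_fd is illustrative:
 *
 *	static int my_consume_group_fd(int group_fd)
 *	{
 *		struct fd f = fdget(group_fd);
 *		struct vfio_group *group;
 *		int iommu_id;
 *
 *		if (!f.file)
 *			return -EBADF;
 *
 *		group = vfio_group_get_external_user(f.file);
 *		fdput(f);
 *		if (IS_ERR(group))
 *			return PTR_ERR(group);	// no container or IOMMU set yet
 *
 *		iommu_id = vfio_external_user_iommu_id(group);
 *		// ... hold the group reference while the device is in use ...
 *		vfio_group_put_external_user(group);
 *		return iommu_id;
 *	}
 */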
1794
1795 /**
1796 * Sub-module support
1797 */
1798 /*
1799 * Helper for managing a buffer of info chain capabilities, allocate or
1800 * reallocate a buffer with additional @size, filling in @id and @version
1801 * of the capability. A pointer to the new capability is returned.
1802 *
1803 * NB. The chain is based at the head of the buffer, so new entries are
1804 * added to the tail, vfio_info_cap_shift() should be called to fixup the
1805 * next offsets prior to copying to the user buffer.
1806 */
1807 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1808 size_t size, u16 id, u16 version)
1809 {
1810 void *buf;
1811 struct vfio_info_cap_header *header, *tmp;
1812
1813 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1814 if (!buf) {
1815 kfree(caps->buf);
1816 caps->size = 0;
1817 return ERR_PTR(-ENOMEM);
1818 }
1819
1820 caps->buf = buf;
1821 header = buf + caps->size;
1822
1823 /* Eventually copied to user buffer, zero */
1824 memset(header, 0, size);
1825
1826 header->id = id;
1827 header->version = version;
1828
1829 /* Add to the end of the capability chain */
1830 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1831 ; /* nothing */
1832
1833 tmp->next = caps->size;
1834 caps->size += size;
1835
1836 return header;
1837 }
1838 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1839
1840 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1841 {
1842 struct vfio_info_cap_header *tmp;
1843 void *buf = (void *)caps->buf;
1844
1845 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1846 tmp->next += offset;
1847 }
1848 EXPORT_SYMBOL(vfio_info_cap_shift);
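
/*
 * A sketch of the intended call pattern for the two helpers above, as a bus
 * driver's region-info ioctl might use them (sparse, info and arg are assumed
 * to come from the surrounding handler):
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	int ret;
 *
 *	ret = vfio_info_add_capability(&caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP,
 *				       sparse);
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz < sizeof(info) + caps.size) {
 *			info.argsz = sizeof(info) + caps.size;
 *			info.cap_offset = 0;
 *		} else {
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user((void __user *)arg + sizeof(info),
 *					 caps.buf, caps.size)) {
 *				kfree(caps.buf);
 *				return -EFAULT;
 *			}
 *			info.cap_offset = sizeof(info);
 *		}
 *		kfree(caps.buf);
 *	}
 */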
1849
1850 static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
1851 {
1852 struct vfio_info_cap_header *header;
1853 struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
1854 size_t size;
1855
1856 size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
1857 header = vfio_info_cap_add(caps, size,
1858 VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
1859 if (IS_ERR(header))
1860 return PTR_ERR(header);
1861
1862 sparse_cap = container_of(header,
1863 struct vfio_region_info_cap_sparse_mmap, header);
1864 sparse_cap->nr_areas = sparse->nr_areas;
1865 memcpy(sparse_cap->areas, sparse->areas,
1866 sparse->nr_areas * sizeof(*sparse->areas));
1867 return 0;
1868 }
1869
1870 static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
1871 {
1872 struct vfio_info_cap_header *header;
1873 struct vfio_region_info_cap_type *type_cap, *cap = cap_type;
1874
1875 header = vfio_info_cap_add(caps, sizeof(*cap),
1876 VFIO_REGION_INFO_CAP_TYPE, 1);
1877 if (IS_ERR(header))
1878 return PTR_ERR(header);
1879
1880 type_cap = container_of(header, struct vfio_region_info_cap_type,
1881 header);
1882 type_cap->type = cap->type;
1883 type_cap->subtype = cap->subtype;
1884 return 0;
1885 }
1886
1887 int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
1888 void *cap_type)
1889 {
1890 int ret = -EINVAL;
1891
1892 if (!cap_type)
1893 return 0;
1894
1895 switch (cap_type_id) {
1896 case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
1897 ret = sparse_mmap_cap(caps, cap_type);
1898 break;
1899
1900 case VFIO_REGION_INFO_CAP_TYPE:
1901 ret = region_type_cap(caps, cap_type);
1902 break;
1903 }
1904
1905 return ret;
1906 }
1907 EXPORT_SYMBOL(vfio_info_add_capability);
1908
1909 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1910 int max_irq_type, size_t *data_size)
1911 {
1912 unsigned long minsz;
1913 size_t size;
1914
1915 minsz = offsetofend(struct vfio_irq_set, count);
1916
1917 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1918 (hdr->count >= (U32_MAX - hdr->start)) ||
1919 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1920 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1921 return -EINVAL;
1922
1923 if (data_size)
1924 *data_size = 0;
1925
1926 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1927 return -EINVAL;
1928
1929 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1930 case VFIO_IRQ_SET_DATA_NONE:
1931 size = 0;
1932 break;
1933 case VFIO_IRQ_SET_DATA_BOOL:
1934 size = sizeof(uint8_t);
1935 break;
1936 case VFIO_IRQ_SET_DATA_EVENTFD:
1937 size = sizeof(int32_t);
1938 break;
1939 default:
1940 return -EINVAL;
1941 }
1942
1943 if (size) {
1944 if (hdr->argsz - minsz < hdr->count * size)
1945 return -EINVAL;
1946
1947 if (!data_size)
1948 return -EINVAL;
1949
1950 *data_size = hdr->count * size;
1951 }
1952
1953 return 0;
1954 }
1955 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
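
/*
 * A sketch of how a bus driver's VFIO_DEVICE_SET_IRQS handler might use the
 * helper above (my_num_irqs, MY_NUM_IRQ_TYPES and the later programming step
 * are assumptions of the example):
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *	u8 *data = NULL;
 *	int ret;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 *
 *	// ... program hdr.index/hdr.start/hdr.count using data ...
 *	kfree(data);
 */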
1956
1957 /*
1958 * Pin a set of guest PFNs and return their associated host PFNs for local
1959 * domain only.
1960 * @dev [in] : device
1961 * @user_pfn [in]: array of user/guest PFNs to be pinned.
1962 * @npage [in] : count of elements in user_pfn array. This count should not
1963 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1964 * @prot [in] : protection flags
1965 * @phys_pfn[out]: array of host PFNs
1966 * Return error or number of pages pinned.
1967 */
1968 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1969 int prot, unsigned long *phys_pfn)
1970 {
1971 struct vfio_container *container;
1972 struct vfio_group *group;
1973 struct vfio_iommu_driver *driver;
1974 int ret;
1975
1976 if (!dev || !user_pfn || !phys_pfn || !npage)
1977 return -EINVAL;
1978
1979 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1980 return -E2BIG;
1981
1982 group = vfio_group_get_from_dev(dev);
1983 if (!group)
1984 return -ENODEV;
1985
1986 ret = vfio_group_add_container_user(group);
1987 if (ret)
1988 goto err_pin_pages;
1989
1990 container = group->container;
1991 driver = container->iommu_driver;
1992 if (likely(driver && driver->ops->pin_pages))
1993 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1994 npage, prot, phys_pfn);
1995 else
1996 ret = -ENOTTY;
1997
1998 vfio_group_try_dissolve_container(group);
1999
2000 err_pin_pages:
2001 vfio_group_put(group);
2002 return ret;
2003 }
2004 EXPORT_SYMBOL(vfio_pin_pages);
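
/*
 * Usage sketch, for illustration only: a mediated-device vendor driver that
 * needs to DMA into guest memory pins the guest PFNs backing a buffer and
 * programs its hardware with the returned host PFNs.  Each call must cover
 * at most VFIO_PIN_PAGES_MAX_ENTRIES pages.  The example_ helper is
 * hypothetical.
 */
static int example_pin_guest_buffer(struct device *mdev_dev,
				    unsigned long *gfns,
				    unsigned long *hpfns, int npages)
{
	int pinned = vfio_pin_pages(mdev_dev, gfns, npages,
				    IOMMU_READ | IOMMU_WRITE, hpfns);

	if (pinned < 0)
		return pinned;			/* negative errno */

	if (pinned != npages) {
		/* Partial pin: undo what was pinned and report failure. */
		vfio_unpin_pages(mdev_dev, gfns, pinned);
		return -EFAULT;
	}

	return 0;
}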
2005
2006 /*
2007 * Unpin a set of host PFNs for local domain only.
2008 * @dev [in] : device
2009 * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
2010 * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2011 * @npage [in] : count of elements in user_pfn array. This count should not
2012 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
2013 * Return error or number of pages unpinned.
2014 */
2015 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
2016 {
2017 struct vfio_container *container;
2018 struct vfio_group *group;
2019 struct vfio_iommu_driver *driver;
2020 int ret;
2021
2022 if (!dev || !user_pfn || !npage)
2023 return -EINVAL;
2024
2025 if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
2026 return -E2BIG;
2027
2028 group = vfio_group_get_from_dev(dev);
2029 if (!group)
2030 return -ENODEV;
2031
2032 ret = vfio_group_add_container_user(group);
2033 if (ret)
2034 goto err_unpin_pages;
2035
2036 container = group->container;
2037 driver = container->iommu_driver;
2038 if (likely(driver && driver->ops->unpin_pages))
2039 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
2040 npage);
2041 else
2042 ret = -ENOTTY;
2043
2044 vfio_group_try_dissolve_container(group);
2045
2046 err_unpin_pages:
2047 vfio_group_put(group);
2048 return ret;
2049 }
2050 EXPORT_SYMBOL(vfio_unpin_pages);
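
/*
 * Usage sketch, for illustration only: pages pinned as above are released
 * once the device is done with them, e.g. on a DMA unmap notification or at
 * device teardown.  The example_ helper is hypothetical.
 */
static void example_unpin_guest_buffer(struct device *mdev_dev,
				       unsigned long *gfns, int npages)
{
	int unpinned = vfio_unpin_pages(mdev_dev, gfns, npages);

	WARN_ON(unpinned != npages);
}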
2051
2052 static int vfio_register_iommu_notifier(struct vfio_group *group,
2053 unsigned long *events,
2054 struct notifier_block *nb)
2055 {
2056 struct vfio_container *container;
2057 struct vfio_iommu_driver *driver;
2058 int ret;
2059
2060 ret = vfio_group_add_container_user(group);
2061 if (ret)
2062 return -EINVAL;
2063
2064 container = group->container;
2065 driver = container->iommu_driver;
2066 if (likely(driver && driver->ops->register_notifier))
2067 ret = driver->ops->register_notifier(container->iommu_data,
2068 events, nb);
2069 else
2070 ret = -ENOTTY;
2071
2072 vfio_group_try_dissolve_container(group);
2073
2074 return ret;
2075 }
2076
2077 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2078 struct notifier_block *nb)
2079 {
2080 struct vfio_container *container;
2081 struct vfio_iommu_driver *driver;
2082 int ret;
2083
2084 ret = vfio_group_add_container_user(group);
2085 if (ret)
2086 return -EINVAL;
2087
2088 container = group->container;
2089 driver = container->iommu_driver;
2090 if (likely(driver && driver->ops->unregister_notifier))
2091 ret = driver->ops->unregister_notifier(container->iommu_data,
2092 nb);
2093 else
2094 ret = -ENOTTY;
2095
2096 vfio_group_try_dissolve_container(group);
2097
2098 return ret;
2099 }
2100
2101 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2102 {
2103 group->kvm = kvm;
2104 blocking_notifier_call_chain(&group->notifier,
2105 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2106 }
2107 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2108
2109 static int vfio_register_group_notifier(struct vfio_group *group,
2110 unsigned long *events,
2111 struct notifier_block *nb)
2112 {
2113 int ret;
2114 bool set_kvm = false;
2115
2116 if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2117 set_kvm = true;
2118
2119 /* clear known events */
2120 *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2121
2122 /* refuse to continue if unknown events remain */
2123 if (*events)
2124 return -EINVAL;
2125
2126 ret = vfio_group_add_container_user(group);
2127 if (ret)
2128 return -EINVAL;
2129
2130 ret = blocking_notifier_chain_register(&group->notifier, nb);
2131
2132 /*
2133 * KVM may already have been attached to this vfio_group, so replay
2134 * the event once here upon registration so the new listener sees it.
2135 */
2136 if (!ret && set_kvm && group->kvm)
2137 blocking_notifier_call_chain(&group->notifier,
2138 VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2139
2140 vfio_group_try_dissolve_container(group);
2141
2142 return ret;
2143 }
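
/*
 * Usage sketch, for illustration only: a callback registered for
 * VFIO_GROUP_NOTIFY_SET_KVM receives the struct kvm pointer as its data
 * argument (NULL on detach).  Thanks to the replay above, a listener that
 * registers after KVM was attached still sees the event once.  struct
 * example_vdev is a hypothetical vendor-driver structure embedding the
 * notifier_block.
 */
static int example_group_notifier(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct example_vdev *vdev = container_of(nb, struct example_vdev,
						 group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM)
		vdev->kvm = data;	/* NULL means KVM was detached */

	return NOTIFY_OK;
}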
2144
2145 static int vfio_unregister_group_notifier(struct vfio_group *group,
2146 struct notifier_block *nb)
2147 {
2148 int ret;
2149
2150 ret = vfio_group_add_container_user(group);
2151 if (ret)
2152 return -EINVAL;
2153
2154 ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2155
2156 vfio_group_try_dissolve_container(group);
2157
2158 return ret;
2159 }
2160
2161 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2162 unsigned long *events, struct notifier_block *nb)
2163 {
2164 struct vfio_group *group;
2165 int ret;
2166
2167 if (!dev || !nb || !events || (*events == 0))
2168 return -EINVAL;
2169
2170 group = vfio_group_get_from_dev(dev);
2171 if (!group)
2172 return -ENODEV;
2173
2174 switch (type) {
2175 case VFIO_IOMMU_NOTIFY:
2176 ret = vfio_register_iommu_notifier(group, events, nb);
2177 break;
2178 case VFIO_GROUP_NOTIFY:
2179 ret = vfio_register_group_notifier(group, events, nb);
2180 break;
2181 default:
2182 ret = -EINVAL;
2183 }
2184
2185 vfio_group_put(group);
2186 return ret;
2187 }
2188 EXPORT_SYMBOL(vfio_register_notifier);
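
/*
 * Usage sketch, for illustration only: a vendor driver that pins guest
 * pages registers an IOMMU notifier so it can drop its pinnings when the
 * covering IOVA range is unmapped.  events is passed by reference; as with
 * the group notifier above, supported bits are consumed and any leftover
 * bit results in an error.  example_iommu_notifier() and struct
 * example_vdev are hypothetical.
 */
static int example_register_dma_unmap_notifier(struct example_vdev *vdev,
					       struct device *mdev_dev)
{
	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;

	vdev->iommu_notifier.notifier_call = example_iommu_notifier;

	return vfio_register_notifier(mdev_dev, VFIO_IOMMU_NOTIFY, &events,
				      &vdev->iommu_notifier);
}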
2189
2190 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2191 struct notifier_block *nb)
2192 {
2193 struct vfio_group *group;
2194 int ret;
2195
2196 if (!dev || !nb)
2197 return -EINVAL;
2198
2199 group = vfio_group_get_from_dev(dev);
2200 if (!group)
2201 return -ENODEV;
2202
2203 switch (type) {
2204 case VFIO_IOMMU_NOTIFY:
2205 ret = vfio_unregister_iommu_notifier(group, nb);
2206 break;
2207 case VFIO_GROUP_NOTIFY:
2208 ret = vfio_unregister_group_notifier(group, nb);
2209 break;
2210 default:
2211 ret = -EINVAL;
2212 }
2213
2214 vfio_group_put(group);
2215 return ret;
2216 }
2217 EXPORT_SYMBOL(vfio_unregister_notifier);
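
/*
 * Usage sketch, for illustration only: notifiers registered earlier are
 * torn down symmetrically when the vendor driver releases the device.
 * struct example_vdev and its notifier_block fields are hypothetical.
 */
static void example_release_notifiers(struct example_vdev *vdev,
				      struct device *mdev_dev)
{
	vfio_unregister_notifier(mdev_dev, VFIO_IOMMU_NOTIFY,
				 &vdev->iommu_notifier);
	vfio_unregister_notifier(mdev_dev, VFIO_GROUP_NOTIFY,
				 &vdev->group_notifier);
}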
2218
2219 /**
2220 * Module/class support
2221 */
2222 static char *vfio_devnode(struct device *dev, umode_t *mode)
2223 {
2224 return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2225 }
2226
2227 static struct miscdevice vfio_dev = {
2228 .minor = VFIO_MINOR,
2229 .name = "vfio",
2230 .fops = &vfio_fops,
2231 .nodename = "vfio/vfio",
2232 .mode = S_IRUGO | S_IWUGO,
2233 };
2234
2235 static int __init vfio_init(void)
2236 {
2237 int ret;
2238
2239 idr_init(&vfio.group_idr);
2240 mutex_init(&vfio.group_lock);
2241 mutex_init(&vfio.iommu_drivers_lock);
2242 INIT_LIST_HEAD(&vfio.group_list);
2243 INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2244 init_waitqueue_head(&vfio.release_q);
2245
2246 ret = misc_register(&vfio_dev);
2247 if (ret) {
2248 pr_err("vfio: misc device register failed\n");
2249 return ret;
2250 }
2251
2252 /* /dev/vfio/$GROUP */
2253 vfio.class = class_create(THIS_MODULE, "vfio");
2254 if (IS_ERR(vfio.class)) {
2255 ret = PTR_ERR(vfio.class);
2256 goto err_class;
2257 }
2258
2259 vfio.class->devnode = vfio_devnode;
2260
2261 ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
2262 if (ret)
2263 goto err_alloc_chrdev;
2264
2265 cdev_init(&vfio.group_cdev, &vfio_group_fops);
2266 ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
2267 if (ret)
2268 goto err_cdev_add;
2269
2270 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2271
2272 #ifdef CONFIG_VFIO_NOIOMMU
2273 vfio_register_iommu_driver(&vfio_noiommu_ops);
2274 #endif
2275 return 0;
2276
2277 err_cdev_add:
2278 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2279 err_alloc_chrdev:
2280 class_destroy(vfio.class);
2281 vfio.class = NULL;
2282 err_class:
2283 misc_deregister(&vfio_dev);
2284 return ret;
2285 }
2286
2287 static void __exit vfio_cleanup(void)
2288 {
2289 WARN_ON(!list_empty(&vfio.group_list));
2290
2291 #ifdef CONFIG_VFIO_NOIOMMU
2292 vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2293 #endif
2294 idr_destroy(&vfio.group_idr);
2295 cdev_del(&vfio.group_cdev);
2296 unregister_chrdev_region(vfio.group_devt, MINORMASK);
2297 class_destroy(vfio.class);
2298 vfio.class = NULL;
2299 misc_deregister(&vfio_dev);
2300 }
2301
2302 module_init(vfio_init);
2303 module_exit(vfio_cleanup);
2304
2305 MODULE_VERSION(DRIVER_VERSION);
2306 MODULE_LICENSE("GPL v2");
2307 MODULE_AUTHOR(DRIVER_AUTHOR);
2308 MODULE_DESCRIPTION(DRIVER_DESC);
2309 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2310 MODULE_ALIAS("devname:vfio/vfio");
2311 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");