/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	dev_t				group_devt;
	wait_queue_head_t		release_q;
} vfio;
struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
	bool				noiommu;
};

struct vfio_unbound_dev {
	struct device			*dev;
	struct list_head		unbound_next;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct device			*dev;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	struct list_head		unbound_list;
	struct mutex			unbound_lock;
	atomic_t			opened;
	wait_queue_head_t		container_q;
	bool				noiommu;
	struct kvm			*kvm;
	struct blocking_notifier_head	notifier;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
#ifdef CONFIG_VFIO_NOIOMMU
static bool noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif
/*
 * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 * and remove functions, any use cases other than acquiring the first
 * reference for the purpose of calling vfio_add_group_dev() or removing
 * that symmetric reference after vfio_del_group_dev() should use the raw
 * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 * removes the device from the dummy group and cannot be nested.
 */
struct iommu_group *vfio_iommu_group_get(struct device *dev)
{
	struct iommu_group *group;
	int __maybe_unused ret;

	group = iommu_group_get(dev);

#ifdef CONFIG_VFIO_NOIOMMU
	/*
	 * With noiommu enabled, an IOMMU group will be created for a device
	 * that doesn't already have one and doesn't have an iommu_ops on their
	 * bus.  We set iommudata simply to be able to identify these groups
	 * as special use and for reclamation later.
	 */
	if (group || !noiommu || iommu_present(dev->bus))
		return group;

	group = iommu_group_alloc();
	if (IS_ERR(group))
		return NULL;

	iommu_group_set_name(group, "vfio-noiommu");
	iommu_group_set_iommudata(group, &noiommu, NULL);
	ret = iommu_group_add_device(group, dev);
	if (ret) {
		iommu_group_put(group);
		return NULL;
	}

	/*
	 * Where to taint?  At this point we've added an IOMMU group for a
	 * device that is not backed by iommu_ops, therefore any iommu_
	 * callback using iommu_ops can legitimately Oops.  So, while we may
	 * be about to give a DMA capable device to a user without IOMMU
	 * protection, which is clearly taint-worthy, let's go ahead and do
	 * it here.
	 */
	add_taint(TAINT_USER, LOCKDEP_STILL_OK);
	dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
#endif

	return group;
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
{
#ifdef CONFIG_VFIO_NOIOMMU
	if (iommu_group_get_iommudata(group) == &noiommu)
		iommu_group_remove_device(dev);
#endif

	iommu_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
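
/*
 * Example (illustrative only): a VFIO bus driver is expected to pair the
 * calls above in its probe and remove paths, acquiring the group reference
 * before vfio_add_group_dev() and dropping it after vfio_del_group_dev().
 * The names below are hypothetical, not part of this API:
 *
 *	static int hypothetical_vfio_probe(struct device *dev, void *vdev)
 *	{
 *		struct iommu_group *group = vfio_iommu_group_get(dev);
 *		int ret;
 *
 *		if (!group)
 *			return -EINVAL;
 *
 *		ret = vfio_add_group_dev(dev, &hypothetical_device_ops, vdev);
 *		if (ret)
 *			vfio_iommu_group_put(group, dev);
 *
 *		return ret;
 *	}
 */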
#ifdef CONFIG_VFIO_NOIOMMU
static void *vfio_noiommu_open(unsigned long arg)
{
	if (arg != VFIO_NOIOMMU_IOMMU)
		return ERR_PTR(-EINVAL);
	if (!capable(CAP_SYS_RAWIO))
		return ERR_PTR(-EPERM);

	return NULL;
}

static void vfio_noiommu_release(void *iommu_data)
{
}

static long vfio_noiommu_ioctl(void *iommu_data,
			       unsigned int cmd, unsigned long arg)
{
	if (cmd == VFIO_CHECK_EXTENSION)
		return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;

	return -ENOTTY;
}

static int vfio_noiommu_attach_group(void *iommu_data,
				     struct iommu_group *iommu_group)
{
	return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
}

static void vfio_noiommu_detach_group(void *iommu_data,
				      struct iommu_group *iommu_group)
{
}

static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
	.name = "vfio-noiommu",
	.owner = THIS_MODULE,
	.open = vfio_noiommu_open,
	.release = vfio_noiommu_release,
	.ioctl = vfio_noiommu_ioctl,
	.attach_group = vfio_noiommu_attach_group,
	.detach_group = vfio_noiommu_detach_group,
};
#endif
/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);

void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
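
/*
 * Example (illustrative only): an IOMMU backend such as vfio_iommu_type1
 * registers its ops table from module init and unregisters on exit.  The
 * ops and function names below are a hypothetical sketch:
 *
 *	static const struct vfio_iommu_driver_ops hypothetical_iommu_ops = {
 *		.name		= "hypothetical-iommu",
 *		.owner		= THIS_MODULE,
 *		.open		= hypothetical_open,
 *		.release	= hypothetical_release,
 *		.ioctl		= hypothetical_ioctl,
 *		.attach_group	= hypothetical_attach_group,
 *		.detach_group	= hypothetical_detach_group,
 *	};
 *
 *	static int __init hypothetical_init(void)
 *	{
 *		return vfio_register_iommu_driver(&hypothetical_iommu_ops);
 *	}
 */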
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	INIT_LIST_HEAD(&group->unbound_list);
	mutex_init(&group->unbound_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	init_waitqueue_head(&group->container_q);
	group->iommu_group = iommu_group;
#ifdef CONFIG_VFIO_NOIOMMU
	group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
#endif
	BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	dev = device_create(vfio.class, NULL,
			    MKDEV(MAJOR(vfio.group_devt), minor),
			    group, "%s%d", group->noiommu ? "noiommu-" : "",
			    iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return ERR_CAST(dev);
	}

	group->minor = minor;
	group->dev = dev;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);
	struct vfio_unbound_dev *unbound, *tmp;
	struct iommu_group *iommu_group = group->iommu_group;

	WARN_ON(!list_empty(&group->device_list));
	WARN_ON(group->notifier.head);

	list_for_each_entry_safe(unbound, tmp,
				 &group->unbound_list, unbound_next) {
		list_del(&unbound->unbound_next);
		kfree(unbound);
	}

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
	iommu_group_put(iommu_group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}
struct vfio_group_put_work {
	struct work_struct work;
	struct vfio_group *group;
};

static void vfio_group_put_bg(struct work_struct *work)
{
	struct vfio_group_put_work *do_work;

	do_work = container_of(work, struct vfio_group_put_work, work);

	vfio_group_put(do_work->group);
	kfree(do_work);
}

static void vfio_group_schedule_put(struct vfio_group *group)
{
	struct vfio_group_put_work *do_work;

	do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
	if (WARN_ON(!do_work))
		return;

	INIT_WORK(&do_work->work, vfio_group_put_bg);
	do_work->group = group;
	schedule_work(&do_work->work);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}

static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return NULL;

	group = vfio_group_get_from_iommu(iommu_group);
	iommu_group_put(iommu_group);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;
	dev_set_drvdata(dev, device);

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}

static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}

static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * Some drivers, like pci-stub, are only used to prevent other drivers from
 * claiming a device and are therefore perfectly legitimate for a user owned
 * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 * of the device, but it does prevent the user from having direct access to
 * the device, which is useful in some circumstances.
 *
 * We also assume that we can include PCI interconnect devices, ie. bridges.
 * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 * then all of the downstream devices will be part of the same IOMMU group as
 * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 * breaks anything, it only does so for user owned devices downstream.  Note
 * that error notification via MSI can be affected for platforms that handle
 * MSI within the same IOVA space as DMA.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub" };

static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
{
	int i;

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
			return true;
	}

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}
/*
 * A vfio group is viable for use by userspace if all devices are in
 * one of the following states:
 *  - driver-less
 *  - bound to a vfio driver
 *  - bound to a whitelisted driver
 *  - a PCI interconnect device
 *
 * We use two methods to determine whether a device is bound to a vfio
 * driver.  The first is to test whether the device exists in the vfio
 * group.  The second is to test if the device exists on the group
 * unbound_list, indicating it's in the middle of transitioning from
 * a vfio driver to driver-less.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);
	struct vfio_unbound_dev *unbound;
	int ret = -EINVAL;

	mutex_lock(&group->unbound_lock);
	list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
		if (dev == unbound->dev) {
			ret = 0;
			break;
		}
	}
	mutex_unlock(&group->unbound_lock);

	if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return ret;
}
/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;
	struct vfio_unbound_dev *unbound;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */

		mutex_lock(&group->unbound_lock);
		list_for_each_entry(unbound,
				    &group->unbound_list, unbound_next) {
			if (dev == unbound->dev) {
				list_del(&unbound->unbound_next);
				kfree(unbound);
				break;
			}
		}
		mutex_unlock(&group->unbound_lock);
		break;
	}

	/*
	 * If we're the last reference to the group, the group will be
	 * released, which includes unregistering the iommu group notifier.
	 * We hold a read-lock on that notifier list, unregistering needs
	 * a write-lock... deadlock.  Release our reference asynchronously
	 * to avoid that situation.
	 */
	vfio_group_schedule_put(group);
	return NOTIFY_OK;
}
/**
 * VFIO driver API
 */
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	} else {
		/*
		 * A found vfio_group already holds a reference to the
		 * iommu_group.  A created vfio_group keeps the reference.
		 */
		iommu_group_put(iommu_group);
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		return PTR_ERR(device);
	}

	/*
	 * Drop all but the vfio_device reference.  The vfio_device holds
	 * a reference to the vfio_group, which holds a reference to the
	 * iommu_group.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * Get a reference to the vfio_device for a device.  Even if the
 * caller thinks they own the device, they could be racing with a
 * release call path, so we can't trust drvdata for the shortcut.
 * Go the long way around, from the iommu_group to the vfio_group
 * to the vfio_device.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_group *group;
	struct vfio_device *device;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return NULL;

	device = vfio_group_get_device(group, dev);
	vfio_group_put(group);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
						     char *buf)
{
	struct vfio_device *it, *device = NULL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(it, &group->device_list, group_next) {
		if (!strcmp(dev_name(it->dev), buf)) {
			device = it;
			vfio_device_get(device);
			break;
		}
	}
	mutex_unlock(&group->device_lock);

	return device;
}

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	void *device_data = device->device_data;
	struct vfio_unbound_dev *unbound;
	unsigned int i = 0;
	bool interrupted = false;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	/*
	 * When the device is removed from the group, the group suddenly
	 * becomes non-viable; the device has a driver (until the unbind
	 * completes), but it's not present in the group.  This is bad news
	 * for any external users that need to re-acquire a group reference
	 * in order to match and release their existing reference.  To
	 * solve this, we track such devices on the unbound_list to bridge
	 * the gap until they're fully unbound.
	 */
	unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
	if (unbound) {
		unbound->dev = dev;
		mutex_lock(&group->unbound_lock);
		list_add(&unbound->unbound_next, &group->unbound_list);
		mutex_unlock(&group->unbound_lock);
	}
	WARN_ON(!unbound);

	vfio_device_put(device);

	/*
	 * If the device is still present in the group after the above
	 * 'put', then it is in use and we need to request it from the
	 * bus driver.  The driver may in turn need to request the
	 * device from the user.  We send the request on an arbitrary
	 * interval with counter to allow the driver to take escalating
	 * measures to release the device if it has the ability to do so.
	 */
	add_wait_queue(&vfio.release_q, &wait);

	do {
		device = vfio_group_get_device(group, dev);
		if (!device)
			break;

		if (device->ops->request)
			device->ops->request(device_data, i++);

		vfio_device_put(device);

		if (interrupted) {
			wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
		} else {
			wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
			if (signal_pending(current)) {
				interrupted = true;
				dev_warn(dev,
					 "Device is currently in use, task"
					 " \"%s\" (%d) "
					 "blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	} while (1);

	remove_wait_queue(&vfio.release_q, &wait);
	/*
	 * In order to support multiple devices per group, devices can be
	 * plucked from the group while other devices in the group are still
	 * in use.  The container persists with this group and those remaining
	 * devices still attached.  If the user creates an isolation violation
	 * by binding this device to another driver while the group is still in
	 * use, that's their fault.  However, in the case of removing the last,
	 * or potentially the only, device in the group there can be no other
	 * in-use devices in the group.  The user has done their due diligence
	 * and we should lay no claims to those devices.  In order to do that,
	 * we need to make sure the group is detached from the container.
	 * Without this stall, we're potentially racing with a user process
	 * that may attempt to immediately bind this device to another driver.
	 */
	if (list_empty(&group->device_list))
		wait_event(group->container_q, !group->container);

	vfio_group_put(group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
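
/*
 * Example (illustrative only): a bus driver's remove callback is the
 * symmetric counterpart of its probe, recovering the device_data pointer
 * it passed to vfio_add_group_dev().  Names are hypothetical:
 *
 *	static void hypothetical_vfio_remove(struct device *dev)
 *	{
 *		struct hypothetical_device *vdev = vfio_del_group_dev(dev);
 *
 *		if (vdev) {
 *			vfio_iommu_group_put(dev->iommu_group, dev);
 *			kfree(vdev);
 *		}
 *	}
 */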
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {

#ifdef CONFIG_VFIO_NOIOMMU
				if (!list_empty(&container->group_list) &&
				    (container->noiommu !=
				     (driver->ops == &vfio_noiommu_ops)))
					continue;
#endif

				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}

static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

#ifdef CONFIG_VFIO_NOIOMMU
		/*
		 * Only noiommu containers can use vfio-noiommu and noiommu
		 * containers can only use vfio-noiommu.
		 */
		if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
			continue;
#endif

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			continue;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (ret) {
			driver->ops->release(data);
			module_put(driver->ops->owner);
			continue;
		}

		container->iommu_driver = driver;
		container->iommu_data = data;
		break;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);
	up_write(&container->group_lock);

	return ret;
}
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}

/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	return ret;
}

static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	wake_up(&group->container_q);
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	/* Real groups and fake groups cannot mix */
	if (!list_empty(&container->group_list) &&
	    container->noiommu != group->noiommu) {
		ret = -EPERM;
		goto unlock_out;
	}

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	container->noiommu = group->noiommu;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}

static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static int vfio_group_add_container_user(struct vfio_group *group)
{
	if (!atomic_inc_not_zero(&group->container_users))
		return -EINVAL;

	if (group->noiommu) {
		atomic_dec(&group->container_users);
		return -EPERM;
	}
	if (!group->container->iommu_driver || !vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return -EINVAL;
	}

	return 0;
}
static const struct file_operations vfio_device_fops;

static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	if (group->noiommu && !capable(CAP_SYS_RAWIO))
		return -EPERM;

	device = vfio_device_get_from_name(group, buf);
	if (!device)
		return -ENODEV;

	ret = device->ops->open(device->device_data);
	if (ret) {
		vfio_device_put(device);
		return ret;
	}

	/*
	 * We can't use anon_inode_getfd() because we need to modify
	 * the f_mode flags directly to allow more than just ioctls
	 */
	ret = get_unused_fd_flags(O_CLOEXEC);
	if (ret < 0) {
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
				   device, O_RDWR);
	if (IS_ERR(filep)) {
		put_unused_fd(ret);
		ret = PTR_ERR(filep);
		device->ops->release(device->device_data);
		vfio_device_put(device);
		return ret;
	}

	/*
	 * TODO: add an anon_inode interface to do this.
	 * Appears to be missing by lack of need rather than
	 * explicitly prevented.  Now there's need.
	 */
	filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

	atomic_inc(&group->container_users);

	fd_install(ret, filep);

	if (group->noiommu)
		dev_warn(device->dev, "vfio-noiommu device opened by user "
			 "(%s:%d)\n", current->comm, task_pid_nr(current));

	return ret;
}
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
		vfio_group_put(group);
		return -EPERM;
	}

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Warn if previous user didn't cleanup and re-init to drop them */
	if (WARN_ON(group->notifier.head))
		BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
/**
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}

#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;
	int ret;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	ret = vfio_group_add_container_user(group);
	if (ret)
		return ERR_PTR(ret);

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

bool vfio_external_group_match_file(struct vfio_group *test_group,
				    struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	return (filep->f_op == &vfio_group_fops) && (group == test_group);
}
EXPORT_SYMBOL_GPL(vfio_external_group_match_file);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);

long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
{
	return vfio_ioctl_check_extension(group->container, arg);
}
EXPORT_SYMBOL_GPL(vfio_external_check_extension);
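
/*
 * Example (illustrative only): an external user such as KVM typically
 * consumes this API in the order described in the comment above, given a
 * group file from user space (error handling elided):
 *
 *	struct vfio_group *group = vfio_group_get_external_user(filep);
 *	int id = vfio_external_user_iommu_id(group);
 *
 *	... hold the group while the external user needs it ...
 *
 *	vfio_group_put_external_user(group);
 */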
/**
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability.  A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
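
/*
 * Example (illustrative only): a sketch of how a sub-driver would build a
 * capability chain for a *_GET_INFO ioctl.  "info", "size", "id" and
 * "version" are placeholders for the caller's ioctl struct and capability
 * identifiers:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *	struct vfio_info_cap_header *header;
 *
 *	header = vfio_info_cap_add(&caps, size, id, version);
 *	if (IS_ERR(header))
 *		return PTR_ERR(header);
 *	... fill in the payload via container_of(header, ..., header) ...
 *
 *	info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *	vfio_info_cap_shift(&caps, sizeof(info));
 *	if (copy_to_user((void __user *)arg + sizeof(info),
 *			 caps.buf, caps.size))
 *		ret = -EFAULT;
 *	kfree(caps.buf);
 */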
static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type;
	size_t size;

	size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas);
	header = vfio_info_cap_add(caps, size,
				   VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	sparse_cap = container_of(header,
			struct vfio_region_info_cap_sparse_mmap, header);
	sparse_cap->nr_areas = sparse->nr_areas;
	memcpy(sparse_cap->areas, sparse->areas,
	       sparse->nr_areas * sizeof(*sparse->areas));
	return 0;
}

static int region_type_cap(struct vfio_info_cap *caps, void *cap_type)
{
	struct vfio_info_cap_header *header;
	struct vfio_region_info_cap_type *type_cap, *cap = cap_type;

	header = vfio_info_cap_add(caps, sizeof(*cap),
				   VFIO_REGION_INFO_CAP_TYPE, 1);
	if (IS_ERR(header))
		return PTR_ERR(header);

	type_cap = container_of(header, struct vfio_region_info_cap_type,
				header);
	type_cap->type = cap->type;
	type_cap->subtype = cap->subtype;
	return 0;
}

int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id,
			     void *cap_type)
{
	int ret = -EINVAL;

	if (!cap_type)
		return 0;

	switch (cap_type_id) {
	case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
		ret = sparse_mmap_cap(caps, cap_type);
		break;

	case VFIO_REGION_INFO_CAP_TYPE:
		ret = region_type_cap(caps, cap_type);
		break;
	}

	return ret;
}
EXPORT_SYMBOL(vfio_info_add_capability);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
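
/*
 * Example (illustrative only): a hypothetical vendor driver's
 * VFIO_DEVICE_SET_IRQS handler might validate the header and then copy in
 * the variable-length payload; "NUM_IRQS" is a placeholder for the
 * device's interrupt count:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *	unsigned long minsz = offsetofend(struct vfio_irq_set, count);
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, NUM_IRQS,
 *						 VFIO_PCI_NUM_IRQS,
 *						 &data_size);
 *	if (!ret && data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */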
/*
 * Pin a set of guest PFNs and return their associated host PFNs for local
 * domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be pinned.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @phys_pfn[out]: array of host PFNs
 * Return error or number of pages pinned.
 */
int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
		   int prot, unsigned long *phys_pfn)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !phys_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_pin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->pin_pages))
		ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
					     npage, prot, phys_pfn);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_pin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_pin_pages);
/*
 * Unpin set of host PFNs for local domain only.
 * @dev [in]     : device
 * @user_pfn [in]: array of user/guest PFNs to be unpinned.  Number of
 *		   user/guest PFNs should not be greater than
 *		   VFIO_PIN_PAGES_MAX_ENTRIES.
 * @npage [in]   : count of elements in user_pfn array.  This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * Return error or number of pages unpinned.
 */
int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
{
	struct vfio_container *container;
	struct vfio_group *group;
	struct vfio_iommu_driver *driver;
	int ret;

	if (!dev || !user_pfn || !npage)
		return -EINVAL;

	if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
		return -E2BIG;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	ret = vfio_group_add_container_user(group);
	if (ret)
		goto err_unpin_pages;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unpin_pages))
		ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
					       npage);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

err_unpin_pages:
	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unpin_pages);
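
/*
 * Example (illustrative only): a mediated device vendor driver translating
 * a guest buffer might pin the backing page and unpin it once the DMA is
 * done; "gpa" and "mdev" are hypothetical:
 *
 *	unsigned long user_pfn[1] = { gpa >> PAGE_SHIFT };
 *	unsigned long phys_pfn[1];
 *
 *	ret = vfio_pin_pages(mdev_dev(mdev), user_pfn, 1,
 *			     IOMMU_READ | IOMMU_WRITE, phys_pfn);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	... program hardware with phys_pfn[0] ...
 *
 *	vfio_unpin_pages(mdev_dev(mdev), user_pfn, 1);
 */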
static int vfio_register_iommu_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->register_notifier))
		ret = driver->ops->register_notifier(container->iommu_data,
						     events, nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_iommu_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	container = group->container;
	driver = container->iommu_driver;
	if (likely(driver && driver->ops->unregister_notifier))
		ret = driver->ops->unregister_notifier(container->iommu_data,
						       nb);
	else
		ret = -ENOTTY;

	vfio_group_try_dissolve_container(group);

	return ret;
}

void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
	group->kvm = kvm;
	blocking_notifier_call_chain(&group->notifier,
				VFIO_GROUP_NOTIFY_SET_KVM, kvm);
}
EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
static int vfio_register_group_notifier(struct vfio_group *group,
					unsigned long *events,
					struct notifier_block *nb)
{
	int ret;
	bool set_kvm = false;

	if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
		set_kvm = true;

	/* clear known events */
	*events &= ~VFIO_GROUP_NOTIFY_SET_KVM;

	/* refuse to continue if still events remaining */
	if (*events)
		return -EINVAL;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_register(&group->notifier, nb);

	/*
	 * The attaching of kvm and vfio_group might already happen, so
	 * here we replay once upon registration.
	 */
	if (!ret && set_kvm && group->kvm)
		blocking_notifier_call_chain(&group->notifier,
					VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);

	vfio_group_try_dissolve_container(group);

	return ret;
}

static int vfio_unregister_group_notifier(struct vfio_group *group,
					  struct notifier_block *nb)
{
	int ret;

	ret = vfio_group_add_container_user(group);
	if (ret)
		return -EINVAL;

	ret = blocking_notifier_chain_unregister(&group->notifier, nb);

	vfio_group_try_dissolve_container(group);

	return ret;
}

int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
			   unsigned long *events, struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb || !events || (*events == 0))
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_register_iommu_notifier(group, events, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_register_group_notifier(group, events, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_register_notifier);

int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
			     struct notifier_block *nb)
{
	struct vfio_group *group;
	int ret;

	if (!dev || !nb)
		return -EINVAL;

	group = vfio_group_get_from_dev(dev);
	if (!group)
		return -ENODEV;

	switch (type) {
	case VFIO_IOMMU_NOTIFY:
		ret = vfio_unregister_iommu_notifier(group, nb);
		break;
	case VFIO_GROUP_NOTIFY:
		ret = vfio_unregister_group_notifier(group, nb);
		break;
	default:
		ret = -EINVAL;
	}

	vfio_group_put(group);
	return ret;
}
EXPORT_SYMBOL(vfio_unregister_notifier);
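
/*
 * Example (illustrative only): a vendor driver that pins pages usually also
 * registers for DMA unmap notifications so it can invalidate its pinnings.
 * The callback and variable names below are hypothetical:
 *
 *	static int hypothetical_dma_notifier(struct notifier_block *nb,
 *					     unsigned long action, void *data)
 *	{
 *		if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
 *			struct vfio_iommu_type1_dma_unmap *unmap = data;
 *
 *			... unpin anything in [unmap->iova,
 *			    unmap->iova + unmap->size) ...
 *		}
 *		return NOTIFY_OK;
 *	}
 *
 *	unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
 *
 *	nb.notifier_call = hypothetical_dma_notifier;
 *	ret = vfio_register_notifier(dev, VFIO_IOMMU_NOTIFY, &events, &nb);
 */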
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

static struct miscdevice vfio_dev = {
	.minor = VFIO_MINOR,
	.name = "vfio",
	.fops = &vfio_fops,
	.nodename = "vfio/vfio",
	.mode = S_IRUGO | S_IWUGO,
};
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	ret = misc_register(&vfio_dev);
	if (ret) {
		pr_err("vfio: misc device register failed\n");
		return ret;
	}

	/* /dev/vfio/$GROUP */
	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_alloc_chrdev;

	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
	if (ret)
		goto err_cdev_add;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_register_iommu_driver(&vfio_noiommu_ops);
#endif
	return 0;

err_cdev_add:
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
err_alloc_chrdev:
	class_destroy(vfio.class);
	vfio.class = NULL;
err_class:
	misc_deregister(&vfio_dev);
	return ret;
}

static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

#ifdef CONFIG_VFIO_NOIOMMU
	vfio_unregister_iommu_driver(&vfio_noiommu_ops);
#endif
	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	unregister_chrdev_region(vfio.group_devt, MINORMASK);
	class_destroy(vfio.class);
	vfio.class = NULL;
	misc_deregister(&vfio_dev);
}
2304 module_exit(vfio_cleanup);
2306 MODULE_VERSION(DRIVER_VERSION);
2307 MODULE_LICENSE("GPL v2");
2308 MODULE_AUTHOR(DRIVER_AUTHOR);
2309 MODULE_DESCRIPTION(DRIVER_DESC);
2310 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2311 MODULE_ALIAS("devname:vfio/vfio");
2312 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");