drivers/vfio/vfio_iommu_type1.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  *
  12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  14  * VT-d, but that makes it harder to re-use as theoretically anyone
  15  * implementing a similar IOMMU could make use of this.  We expect the
  16  * IOMMU to support the IOMMU API and have few to no restrictions around
  17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  18  * optimized for relatively static mappings of a userspace process with
  19  * userspace pages pinned into memory.  We also assume devices and IOMMU
  20  * domains are PCI based as the IOMMU API is still centered around a
  21  * device/bus interface rather than a group interface.
  22  */
  23
  24 #include <linux/compat.h>
  25 #include <linux/device.h>
  26 #include <linux/fs.h>
  27 #include <linux/highmem.h>
  28 #include <linux/iommu.h>
  29 #include <linux/module.h>
  30 #include <linux/mm.h>
  31 #include <linux/kthread.h>
  32 #include <linux/rbtree.h>
  33 #include <linux/sched/signal.h>
  34 #include <linux/sched/mm.h>
  35 #include <linux/slab.h>
  36 #include <linux/uaccess.h>
  37 #include <linux/vfio.h>
  38 #include <linux/workqueue.h>
  39 #include <linux/notifier.h>
  40 #include "vfio.h"
  41
  42 #define DRIVER_VERSION  "0.2"   /* driver identification strings; their use is outside this part of the file */
  43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  44 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  45
  46 static bool allow_unsafe_interrupts;    /* module parameter, off by default */
  47 module_param_named(allow_unsafe_interrupts,
  48                    allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  49 MODULE_PARM_DESC(allow_unsafe_interrupts,
  50                  "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  51
  52 static bool disable_hugepages;  /* when set, vfio_batch_init() uses a one-page batch and vfio_pin_pages_remote() stops after one batch */
  53 module_param_named(disable_hugepages,
  54                    disable_hugepages, bool, S_IRUGO | S_IWUSR);
  55 MODULE_PARM_DESC(disable_hugepages,
  56                  "Disable VFIO IOMMU support for IOMMU hugepages.");
  57
  58 static unsigned int dma_entry_limit __read_mostly = U16_MAX;    /* default is U16_MAX (65535) */
  59 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
  60 MODULE_PARM_DESC(dma_entry_limit,
  61                  "Maximum number of user DMA mappings per container (65535).");
  62
  63 struct vfio_iommu {     /* state handed to the pin/unpin callbacks as iommu_data */
  64         struct list_head        domain_list;    /* IOMMU-capable domains; when empty, external pins do their own accounting */
  65         struct list_head        iova_list;      /* presumably struct vfio_iova ranges - TODO confirm */
  66         struct mutex            lock;           /* held across vfio_iommu_type1_pin_pages() */
  67         struct rb_root          dma_list;       /* rb-tree of struct vfio_dma, ordered by iova */
  68         struct list_head        device_list;    /* must be non-empty for vfio_iommu_type1_pin_pages() */
  69         struct mutex            device_list_lock;       /* presumably guards device_list - TODO confirm */
  70         unsigned int            dma_avail;
  71         unsigned int            vaddr_invalid_count;    /* non-zero makes page pinning fail with -EBUSY */
  72         uint64_t                pgsize_bitmap;  /* lowest set bit gives the dirty-bitmap page shift */
  73         uint64_t                num_non_pinned_groups;  /* decremented when a group first gets pinned_page_dirty_scope */
  74         bool                    v2;             /* page pinning is refused with -EACCES unless set */
  75         bool                    nesting;
  76         bool                    dirty_page_tracking;    /* when set, newly pinned pages are marked in dma->bitmap */
  77         struct list_head        emulated_iommu_groups;
  78 };
  79
  80 struct vfio_domain {    /* one entry on vfio_iommu.domain_list - NOTE(review): list linkage inferred from field names, confirm */
  81         struct iommu_domain     *domain;        /* underlying IOMMU API domain */
  82         struct list_head        next;
  83         struct list_head        group_list;     /* presumably struct vfio_iommu_group entries - TODO confirm */
  84         bool                    fgsp : 1;       /* Fine-grained super pages */
  85         bool                    enforce_cache_coherency : 1;
  86 };
  87
  88 struct vfio_dma {       /* one user mapping: iova range -> process virtual address range */
  89         struct rb_node          node;           /* linkage in vfio_iommu.dma_list */
  90         dma_addr_t              iova;           /* Device address */
  91         unsigned long           vaddr;          /* Process virtual addr */
  92         size_t                  size;           /* Map size (bytes) */
  93         int                     prot;           /* IOMMU_READ/WRITE */
  94         bool                    iommu_mapped;
  95         bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
  96         bool                    vaddr_invalid;
  97         struct task_struct      *task;          /* task passed to locked-vm accounting */
  98         struct rb_root          pfn_list;       /* Ex-user pinned pfn list */
  99         unsigned long           *bitmap;        /* dirty bitmap, one bit per page; NULL after vfio_dma_bitmap_free() */
 100         struct mm_struct        *mm;            /* mm whose locked_vm is charged and whose pages are pinned */
 101         size_t                  locked_vm;      /* running total of pages charged via vfio_lock_acct() */
 102 };
 103
 104 struct vfio_batch {     /* array of pinned pages that is consumed front to back */
 105         struct page             **pages;        /* for pin_user_pages_remote */
 106         struct page             *fallback_page; /* if pages alloc fails */
 107         int                     capacity;       /* length of pages array */
 108         int                     size;           /* of batch currently */
 109         int                     offset;         /* of next entry in pages */
 110 };
 111
 112 struct vfio_iommu_group {       /* bookkeeping for one iommu_group, found via vfio_iommu_find_iommu_group() */
 113         struct iommu_group      *iommu_group;
 114         struct list_head        next;
 115         bool                    pinned_page_dirty_scope;        /* set by vfio_iommu_type1_pin_pages() after a successful pin for this group */
 116 };
 117
 118 struct vfio_iova {      /* an IOVA range node; NOTE(review): not used in this part of the file */
 119         struct list_head        list;
 120         dma_addr_t              start;
 121         dma_addr_t              end;            /* whether end is inclusive is not visible here - TODO confirm */
 122 };
 123
 124 /*
 125  * Guest RAM pinning working set or DMA target
 126  */
 127 struct vfio_pfn {       /* one externally pinned page, kept in vfio_dma.pfn_list */
 128         struct rb_node          node;           /* ordered by iova in the per-dma rb-tree */
 129         dma_addr_t              iova;           /* Device address */
 130         unsigned long           pfn;            /* Host pfn */
 131         unsigned int            ref_count;      /* starts at 1; entry is unpinned and freed when it reaches 0 */
 132 };
 133
 134 struct vfio_regions {   /* NOTE(review): not used in this part of the file; presumably an iova -> phys run - confirm */
 135         struct list_head list;
 136         dma_addr_t iova;
 137         phys_addr_t phys;
 138         size_t len;
 139 };
 140
 141 #define DIRTY_BITMAP_BYTES(n)   (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) /* bytes for n bits, rounded up to whole u64 words */
 142
 143 /*
 144  * Input argument of number of bits to bitmap_set() is unsigned integer, which
 145  * further casts to signed integer for unaligned multi-bit operation,
 146  * __bitmap_set().
 147  * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 148  * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 149  * system.
 150  */
 151 #define DIRTY_BITMAP_PAGES_MAX   ((u64)INT_MAX) /* most pages a single dirty bitmap may cover */
 152 #define DIRTY_BITMAP_SIZE_MAX    DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) /* size in bytes of such a bitmap */
 153
 154 static int put_pfn(unsigned long pfn, int prot);
 155
 156 static struct vfio_iommu_group*
 157 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
 158                             struct iommu_group *iommu_group);
 159
 160 /*
 161  * This code handles mapping and unmapping of user data buffers
 162  * into DMA'ble space using the IOMMU
 163  */
 164
 165 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 166                                       dma_addr_t start, size_t size)
 167 {
 168         struct rb_node *node = iommu->dma_list.rb_node;
 169
 170         while (node) {
 171                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 172
 173                 if (start + size <= dma->iova)
 174                         node = node->rb_left;
 175                 else if (start >= dma->iova + dma->size)
 176                         node = node->rb_right;
 177                 else
 178                         return dma;
 179         }
 180
 181         return NULL;
 182 }
 183
 184 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
 185                                                 dma_addr_t start, u64 size)
 186 {
 187         struct rb_node *res = NULL;
 188         struct rb_node *node = iommu->dma_list.rb_node;
 189         struct vfio_dma *dma_res = NULL;
 190
 191         while (node) {
 192                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 193
 194                 if (start < dma->iova + dma->size) {
 195                         res = node;
 196                         dma_res = dma;
 197                         if (start >= dma->iova)
 198                                 break;
 199                         node = node->rb_left;
 200                 } else {
 201                         node = node->rb_right;
 202                 }
 203         }
 204         if (res && size && dma_res->iova >= start + size)
 205                 res = NULL;
 206         return res;
 207 }
 208
 209 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 210 {
 211         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 212         struct vfio_dma *dma;
 213
 214         while (*link) {
 215                 parent = *link;
 216                 dma = rb_entry(parent, struct vfio_dma, node);
 217
 218                 if (new->iova + new->size <= dma->iova)
 219                         link = &(*link)->rb_left;
 220                 else
 221                         link = &(*link)->rb_right;
 222         }
 223
 224         rb_link_node(&new->node, parent, link);
 225         rb_insert_color(&new->node, &iommu->dma_list);
 226 }
 227
 228 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 229 {
 230         rb_erase(&old->node, &iommu->dma_list);
 231 }
 232
 233
 234 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 235 {
 236         uint64_t npages = dma->size / pgsize;
 237
 238         if (npages > DIRTY_BITMAP_PAGES_MAX)
 239                 return -EINVAL;
 240
 241         /*
 242          * Allocate extra 64 bits that are used to calculate shift required for
 243          * bitmap_shift_left() to manipulate and club unaligned number of pages
 244          * in adjacent vfio_dma ranges.
 245          */
 246         dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
 247                                GFP_KERNEL);
 248         if (!dma->bitmap)
 249                 return -ENOMEM;
 250
 251         return 0;
 252 }
 253
 254 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
 255 {
 256         kvfree(dma->bitmap);
 257         dma->bitmap = NULL;
 258 }
 259
 260 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
 261 {
 262         struct rb_node *p;
 263         unsigned long pgshift = __ffs(pgsize);
 264
 265         for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
 266                 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
 267
 268                 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
 269         }
 270 }
 271
 272 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
 273 {
 274         struct rb_node *n;
 275         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 276
 277         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 278                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 279
 280                 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
 281         }
 282 }
 283
 284 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
 285 {
 286         struct rb_node *n;
 287
 288         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 289                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 290                 int ret;
 291
 292                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
 293                 if (ret) {
 294                         struct rb_node *p;
 295
 296                         for (p = rb_prev(n); p; p = rb_prev(p)) {
 297                                 struct vfio_dma *dma = rb_entry(n,
 298                                                         struct vfio_dma, node);
 299
 300                                 vfio_dma_bitmap_free(dma);
 301                         }
 302                         return ret;
 303                 }
 304                 vfio_dma_populate_bitmap(dma, pgsize);
 305         }
 306         return 0;
 307 }
 308
 309 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
 310 {
 311         struct rb_node *n;
 312
 313         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 314                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 315
 316                 vfio_dma_bitmap_free(dma);
 317         }
 318 }
 319
 320 /*
 321  * Helper Functions for host iova-pfn list
 322  */
 323 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
 324 {
 325         struct vfio_pfn *vpfn;
 326         struct rb_node *node = dma->pfn_list.rb_node;
 327
 328         while (node) {
 329                 vpfn = rb_entry(node, struct vfio_pfn, node);
 330
 331                 if (iova < vpfn->iova)
 332                         node = node->rb_left;
 333                 else if (iova > vpfn->iova)
 334                         node = node->rb_right;
 335                 else
 336                         return vpfn;
 337         }
 338         return NULL;
 339 }
 340
 341 static void vfio_link_pfn(struct vfio_dma *dma,
 342                           struct vfio_pfn *new)
 343 {
 344         struct rb_node **link, *parent = NULL;
 345         struct vfio_pfn *vpfn;
 346
 347         link = &dma->pfn_list.rb_node;
 348         while (*link) {
 349                 parent = *link;
 350                 vpfn = rb_entry(parent, struct vfio_pfn, node);
 351
 352                 if (new->iova < vpfn->iova)
 353                         link = &(*link)->rb_left;
 354                 else
 355                         link = &(*link)->rb_right;
 356         }
 357
 358         rb_link_node(&new->node, parent, link);
 359         rb_insert_color(&new->node, &dma->pfn_list);
 360 }
 361
 362 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
 363 {
 364         rb_erase(&old->node, &dma->pfn_list);
 365 }
 366
 367 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
 368                                 unsigned long pfn)
 369 {
 370         struct vfio_pfn *vpfn;
 371
 372         vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
 373         if (!vpfn)
 374                 return -ENOMEM;
 375
 376         vpfn->iova = iova;
 377         vpfn->pfn = pfn;
 378         vpfn->ref_count = 1;
 379         vfio_link_pfn(dma, vpfn);
 380         return 0;
 381 }
 382
 383 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
 384                                       struct vfio_pfn *vpfn)
 385 {
 386         vfio_unlink_pfn(dma, vpfn);
 387         kfree(vpfn);
 388 }
 389
 390 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 391                                                unsigned long iova)
 392 {
 393         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 394
 395         if (vpfn)
 396                 vpfn->ref_count++;
 397         return vpfn;
 398 }
 399
 400 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 401 {
 402         int ret = 0;
 403
 404         vpfn->ref_count--;
 405         if (!vpfn->ref_count) {
 406                 ret = put_pfn(vpfn->pfn, dma->prot);
 407                 vfio_remove_from_pfn_list(dma, vpfn);
 408         }
 409         return ret;
 410 }
 411
 412 static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
 413                         bool lock_cap, long npage)
 414 {
 415         int ret = mmap_write_lock_killable(mm);
 416
 417         if (ret)
 418                 return ret;
 419
 420         ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
 421         mmap_write_unlock(mm);
 422         return ret;
 423 }
 424
 425 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 426 {
 427         struct mm_struct *mm;
 428         int ret;
 429
 430         if (!npage)
 431                 return 0;
 432
 433         mm = dma->mm;
 434         if (async && !mmget_not_zero(mm))
 435                 return -ESRCH; /* process exited */
 436
 437         ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
 438         if (!ret)
 439                 dma->locked_vm += npage;
 440
 441         if (async)
 442                 mmput(mm);
 443
 444         return ret;
 445 }
 446
 447 /*
 448  * Some mappings aren't backed by a struct page, for example an mmap'd
 449  * MMIO range for our own or another device.  These use a different
 450  * pfn conversion and shouldn't be tracked as locked pages.
 451  * For compound pages, any driver that sets the reserved bit in head
 452  * page needs to set the reserved bit in all subpages to be safe.
 453  */
 454 static bool is_invalid_reserved_pfn(unsigned long pfn)
 455 {
 456         if (pfn_valid(pfn))
 457                 return PageReserved(pfn_to_page(pfn));
 458
 459         return true;
 460 }
 461
 462 static int put_pfn(unsigned long pfn, int prot)
 463 {
 464         if (!is_invalid_reserved_pfn(pfn)) {
 465                 struct page *page = pfn_to_page(pfn);
 466
 467                 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
 468                 return 1;
 469         }
 470         return 0;
 471 }
 472
 473 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
 474
 475 static void vfio_batch_init(struct vfio_batch *batch)
 476 {
 477         batch->size = 0;
 478         batch->offset = 0;
 479
 480         if (unlikely(disable_hugepages))
 481                 goto fallback;
 482
 483         batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
 484         if (!batch->pages)
 485                 goto fallback;
 486
 487         batch->capacity = VFIO_BATCH_MAX_CAPACITY;
 488         return;
 489
 490 fallback:
 491         batch->pages = &batch->fallback_page;
 492         batch->capacity = 1;
 493 }
 494
 495 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
 496 {
 497         while (batch->size) {
 498                 unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
 499
 500                 put_pfn(pfn, dma->prot);
 501                 batch->offset++;
 502                 batch->size--;
 503         }
 504 }
 505
 506 static void vfio_batch_fini(struct vfio_batch *batch)
 507 {
 508         if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
 509                 free_page((unsigned long)batch->pages);
 510 }
 511
 512 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 513                             unsigned long vaddr, unsigned long *pfn,
 514                             bool write_fault)
 515 {
 516         pte_t *ptep;
 517         pte_t pte;
 518         spinlock_t *ptl;
 519         int ret;
 520
 521         ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
 522         if (ret) {
 523                 bool unlocked = false;
 524
 525                 ret = fixup_user_fault(mm, vaddr,
 526                                        FAULT_FLAG_REMOTE |
 527                                        (write_fault ?  FAULT_FLAG_WRITE : 0),
 528                                        &unlocked);
 529                 if (unlocked)
 530                         return -EAGAIN;
 531
 532                 if (ret)
 533                         return ret;
 534
 535                 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
 536                 if (ret)
 537                         return ret;
 538         }
 539
 540         pte = ptep_get(ptep);
 541
 542         if (write_fault && !pte_write(pte))
 543                 ret = -EFAULT;
 544         else
 545                 *pfn = pte_pfn(pte);
 546
 547         pte_unmap_unlock(ptep, ptl);
 548         return ret;
 549 }
 550
 551 /*
 552  * Returns the positive number of pfns successfully obtained or a negative
 553  * error code.
 554  */
 555 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 556                           long npages, int prot, unsigned long *pfn,
 557                           struct page **pages)
 558 {
 559         struct vm_area_struct *vma;
 560         unsigned int flags = 0;
 561         int ret;
 562
 563         if (prot & IOMMU_WRITE)
 564                 flags |= FOLL_WRITE;
 565
 566         mmap_read_lock(mm);
 567         ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
 568                                     pages, NULL);
 569         if (ret > 0) {
 570                 int i;
 571
 572                 /*
 573                  * The zero page is always resident, we don't need to pin it
 574                  * and it falls into our invalid/reserved test so we don't
 575                  * unpin in put_pfn().  Unpin all zero pages in the batch here.
 576                  */
 577                 for (i = 0 ; i < ret; i++) {
 578                         if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
 579                                 unpin_user_page(pages[i]);
 580                 }
 581
 582                 *pfn = page_to_pfn(pages[0]);
 583                 goto done;
 584         }
 585
 586         vaddr = untagged_addr_remote(mm, vaddr);
 587
 588 retry:
 589         vma = vma_lookup(mm, vaddr);
 590
 591         if (vma && vma->vm_flags & VM_PFNMAP) {
 592                 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
 593                 if (ret == -EAGAIN)
 594                         goto retry;
 595
 596                 if (!ret) {
 597                         if (is_invalid_reserved_pfn(*pfn))
 598                                 ret = 1;
 599                         else
 600                                 ret = -EFAULT;
 601                 }
 602         }
 603 done:
 604         mmap_read_unlock(mm);
 605         return ret;
 606 }
 607
 608 /*
 609  * Attempt to pin pages.  We really don't want to track all the pfns and
 610  * the iommu can only map chunks of consecutive pfns anyway, so get the
 611  * first page and all consecutive pages with the same locking.
      *
      * Pages left over in @batch from an earlier call are consumed before more
      * are requested through vaddr_get_pfns().  The run ends at the first pfn
      * that is not contiguous with *@pfn_base or whose invalid/reserved status
      * differs from that of the first pfn.
      *
      * @limit is compared, in pages, against mm->locked_vm plus the pages newly
      * counted by this call, unless dma->lock_cap is set.
      *
      * Returns the number of pages pinned starting at *@pfn_base, or a negative
      * errno: -ENODEV when current has no mm, -ENOMEM when @limit would be
      * exceeded, or the error from vaddr_get_pfns() or vfio_lock_acct().  On
      * error the pins taken by this call and those left in @batch are dropped.
 612  */
 613 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 614                                   long npage, unsigned long *pfn_base,
 615                                   unsigned long limit, struct vfio_batch *batch)
 616 {
 617         unsigned long pfn;
 618         struct mm_struct *mm = current->mm;
 619         long ret, pinned = 0, lock_acct = 0;    /* lock_acct: pages to charge at "out" */
 620         bool rsvd;      /* invalid/reserved status of *pfn_base; set before pinned becomes non-zero */
 621         dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 622
 623         /* This code path is only user initiated */
 624         if (!mm)
 625                 return -ENODEV;
 626
 627         if (batch->size) {
 628                 /* Leftover pages in batch from an earlier call. */
 629                 *pfn_base = page_to_pfn(batch->pages[batch->offset]);
 630                 pfn = *pfn_base;
 631                 rsvd = is_invalid_reserved_pfn(*pfn_base);
 632         } else {
 633                 *pfn_base = 0;  /* zero means "not chosen yet", see the refill branch below */
 634         }
 635
 636         while (npage) {
 637                 if (!batch->size) {
 638                         /* Empty batch, so refill it. */
 639                         long req_pages = min_t(long, npage, batch->capacity);
 640
 641                         ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
 642                                              &pfn, batch->pages);
 643                         if (ret < 0)
 644                                 goto unpin_out;
 645
 646                         batch->size = ret;
 647                         batch->offset = 0;
 648
 649                         if (!*pfn_base) {
 650                                 *pfn_base = pfn;
 651                                 rsvd = is_invalid_reserved_pfn(*pfn_base);
 652                         }
 653                 }
 654
 655                 /*
 656                  * pfn is preset for the first iteration of this inner loop and
 657                  * updated at the end to handle a VM_PFNMAP pfn.  In that case,
 658                  * batch->pages isn't valid (there's no struct page), so allow
 659                  * batch->pages to be touched only when there's more than one
 660                  * pfn to check, which guarantees the pfns are from a
 661                  * !VM_PFNMAP vma.
 662                  */
 663                 while (true) {
 664                         if (pfn != *pfn_base + pinned ||
 665                             rsvd != is_invalid_reserved_pfn(pfn))
 666                                 goto out;       /* run is broken; the pfn stays in the batch */
 667
 668                         /*
 669                          * Reserved pages aren't counted against the user,
 670                          * externally pinned pages are already counted against
 671                          * the user.
 672                          */
 673                         if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 674                                 if (!dma->lock_cap &&
 675                                     mm->locked_vm + lock_acct + 1 > limit) {
 676                                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 677                                                 __func__, limit << PAGE_SHIFT);
 678                                         ret = -ENOMEM;
 679                                         goto unpin_out;
 680                                 }
 681                                 lock_acct++;
 682                         }
 683
 684                         pinned++;
 685                         npage--;
 686                         vaddr += PAGE_SIZE;
 687                         iova += PAGE_SIZE;
 688                         batch->offset++;
 689                         batch->size--;
 690
 691                         if (!batch->size)
 692                                 break;
 693
 694                         pfn = page_to_pfn(batch->pages[batch->offset]);
 695                 }
 696
 697                 if (unlikely(disable_hugepages))
 698                         break;  /* do not refill: stop after the batch just consumed */
 699         }
 700
 701 out:
 702         ret = vfio_lock_acct(dma, lock_acct, false);    /* charge the pages counted above */
 703
 704 unpin_out:
 705         if (batch->size == 1 && !batch->offset) {
 706                 /* May be a VM_PFNMAP pfn, which the batch can't remember. */
 707                 put_pfn(pfn, dma->prot);
 708                 batch->size = 0;
 709         }
 710
 711         if (ret < 0) {
 712                 if (pinned && !rsvd) {
 713                         for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
 714                                 put_pfn(pfn, dma->prot);        /* drop the pins of the run built so far */
 715                 }
 716                 vfio_batch_unpin(batch, dma);   /* and those still sitting in the batch */
 717
 718                 return ret;
 719         }
 720
 721         return pinned;
 722 }
 723
 724 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 725                                     unsigned long pfn, long npage,
 726                                     bool do_accounting)
 727 {
 728         long unlocked = 0, locked = 0;
 729         long i;
 730
 731         for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
 732                 if (put_pfn(pfn++, dma->prot)) {
 733                         unlocked++;
 734                         if (vfio_find_vpfn(dma, iova))
 735                                 locked++;
 736                 }
 737         }
 738
 739         if (do_accounting)
 740                 vfio_lock_acct(dma, locked - unlocked, true);
 741
 742         return unlocked;
 743 }
 744
 745 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 746                                   unsigned long *pfn_base, bool do_accounting)
 747 {
 748         struct page *pages[1];
 749         struct mm_struct *mm;
 750         int ret;
 751
 752         mm = dma->mm;
 753         if (!mmget_not_zero(mm))
 754                 return -ENODEV;
 755
 756         ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
 757         if (ret != 1)
 758                 goto out;
 759
 760         ret = 0;
 761
 762         if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
 763                 ret = vfio_lock_acct(dma, 1, false);
 764                 if (ret) {
 765                         put_pfn(*pfn_base, dma->prot);
 766                         if (ret == -ENOMEM)
 767                                 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
 768                                         "(%ld) exceeded\n", __func__,
 769                                         dma->task->comm, task_pid_nr(dma->task),
 770                                         task_rlimit(dma->task, RLIMIT_MEMLOCK));
 771                 }
 772         }
 773
 774 out:
 775         mmput(mm);
 776         return ret;
 777 }
 778
 779 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 780                                     bool do_accounting)
 781 {
 782         int unlocked;
 783         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 784
 785         if (!vpfn)
 786                 return 0;
 787
 788         unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 789
 790         if (do_accounting)
 791                 vfio_lock_acct(dma, -unlocked, true);
 792
 793         return unlocked;
 794 }
 795
 796 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 797                                       struct iommu_group *iommu_group,
 798                                       dma_addr_t user_iova,
 799                                       int npage, int prot,
 800                                       struct page **pages)
 801 {
         /*
          * Pin @npage pages of PAGE_SIZE starting at @user_iova and store the
          * corresponding struct page pointers in @pages.
          *
          * A page that already has an entry in the mapping's pfn_list only
          * gains a reference; otherwise it is pinned through
          * vfio_pin_page_external() and recorded with vfio_add_to_pfn_list().
          * The whole operation runs under iommu->lock.
          *
          * Returns the number of pages pinned on success.  Errors:
          *   -EINVAL  @iommu_data or @pages is NULL, device_list is empty, an
          *            iova has no mapping, or the pinned pfn has no struct page
          *   -EACCES  the container is not v2
          *   -EBUSY   iommu->vaddr_invalid_count is non-zero
          *   -EPERM   the mapping does not grant every bit of @prot
          *   others   propagated from vfio_pin_page_external() or
          *            vfio_add_to_pfn_list()
          * After a failure in the loop, the pages handled earlier in this call
          * are released again and their @pages slots are set to NULL.
          */
 802         struct vfio_iommu *iommu = iommu_data;
 803         struct vfio_iommu_group *group;
 804         int i, j, ret;
 805         unsigned long remote_vaddr;
 806         struct vfio_dma *dma;
 807         bool do_accounting;
 808
 809         if (!iommu || !pages)
 810                 return -EINVAL;
 811
 812         /* Supported for v2 version only */
 813         if (!iommu->v2)
 814                 return -EACCES;
 815
 816         mutex_lock(&iommu->lock);
 817
 818         if (WARN_ONCE(iommu->vaddr_invalid_count,
 819                       "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
 820                 ret = -EBUSY;
 821                 goto pin_done;
 822         }
 823
 824         /* Fail if no dma_umap notifier is registered */
 825         if (list_empty(&iommu->device_list)) {
 826                 ret = -EINVAL;
 827                 goto pin_done;
 828         }
 829
 830         /*
 831          * If iommu capable domain exist in the container then all pages are
 832          * already pinned and accounted. Accounting should be done if there is no
 833          * iommu capable domain in the container.
 834          */
 835         do_accounting = list_empty(&iommu->domain_list);
 836
 837         for (i = 0; i < npage; i++) {
 838                 unsigned long phys_pfn;
 839                 dma_addr_t iova;
 840                 struct vfio_pfn *vpfn;
 841
 842                 iova = user_iova + PAGE_SIZE * i;
 843                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 844                 if (!dma) {
 845                         ret = -EINVAL;
 846                         goto pin_unwind;
 847                 }
 848
 849                 if ((dma->prot & prot) != prot) {       /* every requested bit must be granted */
 850                         ret = -EPERM;
 851                         goto pin_unwind;
 852                 }
 853
 854                 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 855                 if (vpfn) {     /* already pinned: the call above took another reference */
 856                         pages[i] = pfn_to_page(vpfn->pfn);
 857                         continue;
 858                 }
 859
 860                 remote_vaddr = dma->vaddr + (iova - dma->iova);
 861                 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn,
 862                                              do_accounting);
 863                 if (ret)
 864                         goto pin_unwind;
 865
 866                 if (!pfn_valid(phys_pfn)) {     /* no struct page to hand back in pages[] */
 867                         ret = -EINVAL;
 868                         goto pin_unwind;
 869                 }
 870
 871                 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn);
 872                 if (ret) {
 873                         if (put_pfn(phys_pfn, dma->prot) && do_accounting)
 874                                 vfio_lock_acct(dma, -1, true);  /* undo the charge made while pinning */
 875                         goto pin_unwind;
 876                 }
 877
 878                 pages[i] = pfn_to_page(phys_pfn);
 879
 880                 if (iommu->dirty_page_tracking) {
 881                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 882
 883                         /*
 884                          * Bitmap populated with the smallest supported page
 885                          * size
 886                          */
 887                         bitmap_set(dma->bitmap,
 888                                    (iova - dma->iova) >> pgshift, 1);
 889                 }
 890         }
 891         ret = i;        /* all npage iterations completed */
 892
 893         group = vfio_iommu_find_iommu_group(iommu, iommu_group);        /* NOTE(review): result is dereferenced without a NULL check - confirm it cannot be NULL here */
 894         if (!group->pinned_page_dirty_scope) {
 895                 group->pinned_page_dirty_scope = true;
 896                 iommu->num_non_pinned_groups--;
 897         }
 898
 899         goto pin_done;
 900
 901 pin_unwind:
 902         pages[i] = NULL;        /* slot of the page that failed */
 903         for (j = 0; j < i; j++) {       /* release the pages handled before the failure */
 904                 dma_addr_t iova;
 905
 906                 iova = user_iova + PAGE_SIZE * j;
 907                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 908                 vfio_unpin_page_external(dma, iova, do_accounting);
 909                 pages[j] = NULL;
 910         }
 911 pin_done:
 912         mutex_unlock(&iommu->lock);
 913         return ret;
 914 }
 915
 916 static void vfio_iommu_type1_unpin_pages(void *iommu_data,
 917                                          dma_addr_t user_iova, int npage)
 918 {
 919         struct vfio_iommu *iommu = iommu_data;
 920         bool do_accounting;
 921         int i;
 922
 923         /* Supported for v2 version only */
 924         if (WARN_ON(!iommu->v2))
 925                 return;
 926
 927         mutex_lock(&iommu->lock);
 928
 929         do_accounting = list_empty(&iommu->domain_list);
 930         for (i = 0; i < npage; i++) {
 931                 dma_addr_t iova = user_iova + PAGE_SIZE * i;
 932                 struct vfio_dma *dma;
 933
 934                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 935                 if (!dma)
 936                         break;
 937
 938                 vfio_unpin_page_external(dma, iova, do_accounting);
 939         }
 940
 941         mutex_unlock(&iommu->lock);
 942
 943         WARN_ON(i != npage);
 944 }
 945
 946 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
 947                             struct list_head *regions,
 948                             struct iommu_iotlb_gather *iotlb_gather)
 949 {
 950         long unlocked = 0;
 951         struct vfio_regions *entry, *next;
 952
 953         iommu_iotlb_sync(domain->domain, iotlb_gather);
 954
 955         list_for_each_entry_safe(entry, next, regions, list) {
 956                 unlocked += vfio_unpin_pages_remote(dma,
 957                                                     entry->iova,
 958                                                     entry->phys >> PAGE_SHIFT,
 959                                                     entry->len >> PAGE_SHIFT,
 960                                                     false);
 961                 list_del(&entry->list);
 962                 kfree(entry);
 963         }
 964
 965         cond_resched();
 966
 967         return unlocked;
 968 }
 969
/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies the maximum number of regions for each IOTLB flush
 * sync.
 */
 977 #define VFIO_IOMMU_TLB_SYNC_MAX         512
 978
 979 static size_t unmap_unpin_fast(struct vfio_domain *domain,
 980                                struct vfio_dma *dma, dma_addr_t *iova,
 981                                size_t len, phys_addr_t phys, long *unlocked,
 982                                struct list_head *unmapped_list,
 983                                int *unmapped_cnt,
 984                                struct iommu_iotlb_gather *iotlb_gather)
 985 {
 986         size_t unmapped = 0;
 987         struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 988
 989         if (entry) {
 990                 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
 991                                             iotlb_gather);
 992
 993                 if (!unmapped) {
 994                         kfree(entry);
 995                 } else {
 996                         entry->iova = *iova;
 997                         entry->phys = phys;
 998                         entry->len  = unmapped;
 999                         list_add_tail(&entry->list, unmapped_list);
1000
1001                         *iova += unmapped;
1002                         (*unmapped_cnt)++;
1003                 }
1004         }
1005
1006         /*
1007          * Sync if the number of fast-unmap regions hits the limit
1008          * or in case of errors.
1009          */
1010         if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1011                 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1012                                              iotlb_gather);
1013                 *unmapped_cnt = 0;
1014         }
1015
1016         return unmapped;
1017 }
1018
1019 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1020                                struct vfio_dma *dma, dma_addr_t *iova,
1021                                size_t len, phys_addr_t phys,
1022                                long *unlocked)
1023 {
1024         size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1025
1026         if (unmapped) {
1027                 *unlocked += vfio_unpin_pages_remote(dma, *iova,
1028                                                      phys >> PAGE_SHIFT,
1029                                                      unmapped >> PAGE_SHIFT,
1030                                                      false);
1031                 *iova += unmapped;
1032                 cond_resched();
1033         }
1034         return unmapped;
1035 }
1036
/*
 * Remove the IOMMU translations of @dma from every domain in the container
 * and unpin the pages that backed them.
 *
 * @iommu:         container owning @dma
 * @dma:           mapping to tear down
 * @do_accounting: when true, the collected unlocked count is passed
 *                 (negated) to vfio_lock_acct() and 0 is returned
 *
 * Returns 0 if @dma is empty or the container has no domain.  Otherwise
 * returns the accumulated result of the unpin helpers, or 0 when
 * @do_accounting is set.  dma->iommu_mapped is cleared once the walk is done.
 */
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
                             bool do_accounting)
{
        dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
        struct vfio_domain *domain, *d;
        LIST_HEAD(unmapped_region_list);
        struct iommu_iotlb_gather iotlb_gather;
        int unmapped_region_cnt = 0;
        long unlocked = 0;

        if (!dma->size)
                return 0;

        if (list_empty(&iommu->domain_list))
                return 0;

        /*
         * We use the IOMMU to track the physical addresses, otherwise we'd
         * need a much more complicated tracking system.  Unfortunately that
         * means we need to use one of the iommu domains to figure out the
         * pfns to unpin.  The rest need to be unmapped in advance so we have
         * no iommu translations remaining when the pages are unpinned.
         */
        domain = d = list_first_entry(&iommu->domain_list,
                                      struct vfio_domain, next);

        /* Every domain after the first is simply unmapped in one call */
        list_for_each_entry_continue(d, &iommu->domain_list, next) {
                iommu_unmap(d->domain, dma->iova, dma->size);
                cond_resched();
        }

        iommu_iotlb_gather_init(&iotlb_gather);
        while (iova < end) {
                size_t unmapped, len;
                phys_addr_t phys, next;

                phys = iommu_iova_to_phys(domain->domain, iova);
                /* No translation for this page: warn and skip over it */
                if (WARN_ON(!phys)) {
                        iova += PAGE_SIZE;
                        continue;
                }

                /*
                 * To optimize for fewer iommu_unmap() calls, each of which
                 * may require hardware cache flushing, try to find the
                 * largest contiguous physical memory chunk to unmap.
                 * No coalescing is attempted when domain->fgsp is set.
                 */
                for (len = PAGE_SIZE;
                     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
                        next = iommu_iova_to_phys(domain->domain, iova + len);
                        if (next != phys + len)
                                break;
                }

                /*
                 * First, try to use fast unmap/unpin. In case of failure,
                 * switch to slow unmap/unpin path.  Both helpers advance
                 * iova by the amount they unmapped.
                 */
                unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
                                            &unlocked, &unmapped_region_list,
                                            &unmapped_region_cnt,
                                            &iotlb_gather);
                if (!unmapped) {
                        unmapped = unmap_unpin_slow(domain, dma, &iova, len,
                                                    phys, &unlocked);
                        /* Neither path made progress: give up on the rest */
                        if (WARN_ON(!unmapped))
                                break;
                }
        }

        dma->iommu_mapped = false;

        /* Flush and unpin whatever the fast path still has queued */
        if (unmapped_region_cnt) {
                unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
                                            &iotlb_gather);
        }

        if (do_accounting) {
                vfio_lock_acct(dma, -unlocked, true);
                return 0;
        }
        return unlocked;
}
1120
/*
 * Destroy @dma and release everything it holds.
 *
 * The range is unmapped and unpinned with accounting, the mapping is
 * unlinked from the container, the task and mm references are dropped, the
 * dirty bitmap is freed and the structure itself is freed.  The container's
 * vaddr_invalid_count is decremented if the mapping was marked
 * vaddr_invalid, and one slot is returned to iommu->dma_avail.
 *
 * Warns if the mapping still has entries in its pfn_list.  @dma must not be
 * used after this call.
 */
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
        WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
        vfio_unmap_unpin(iommu, dma, true);
        vfio_unlink_dma(iommu, dma);
        put_task_struct(dma->task);
        mmdrop(dma->mm);
        vfio_dma_bitmap_free(dma);
        if (dma->vaddr_invalid)
                iommu->vaddr_invalid_count--;
        kfree(dma);
        iommu->dma_avail++;
}
1134
1135 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1136 {
1137         struct vfio_domain *domain;
1138
1139         iommu->pgsize_bitmap = ULONG_MAX;
1140
1141         list_for_each_entry(domain, &iommu->domain_list, next)
1142                 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1143
1144         /*
1145          * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1146          * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1147          * That way the user will be able to map/unmap buffers whose size/
1148          * start address is aligned with PAGE_SIZE. Pinning code uses that
1149          * granularity while iommu driver can use the sub-PAGE_SIZE size
1150          * to map the buffer.
1151          */
1152         if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1153                 iommu->pgsize_bitmap &= PAGE_MASK;
1154                 iommu->pgsize_bitmap |= PAGE_SIZE;
1155         }
1156 }
1157
/*
 * Copy the dirty bitmap of @dma into the user buffer @bitmap.
 *
 * @bitmap:    user buffer holding one bit per @pgsize page, bit 0
 *             corresponding to @base_iova
 * @iommu:     container owning @dma
 * @dma:       mapping whose bitmap is reported
 * @base_iova: iova represented by bit 0 of @bitmap
 * @pgsize:    page size represented by one bit
 *
 * NOTE: dma->bitmap is modified in place (it may be fully set and/or shifted
 * left), so the caller cannot keep using its previous contents afterwards.
 *
 * Returns 0 on success, -EFAULT if a user copy fails.
 */
static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
                              struct vfio_dma *dma, dma_addr_t base_iova,
                              size_t pgsize)
{
        unsigned long pgshift = __ffs(pgsize);
        /* Number of bits describing this mapping */
        unsigned long nbits = dma->size >> pgshift;
        /* Position of the mapping's first bit within the user bitmap */
        unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
        /* The same position split into a word index and a bit within it */
        unsigned long copy_offset = bit_offset / BITS_PER_LONG;
        unsigned long shift = bit_offset % BITS_PER_LONG;
        unsigned long leftover;

        /*
         * mark all pages dirty if any IOMMU capable device is not able
         * to report dirty pages and all pages are pinned and mapped.
         */
        if (iommu->num_non_pinned_groups && dma->iommu_mapped)
                bitmap_set(dma->bitmap, 0, nbits);

        if (shift) {
                /* Line the mapping's bits up with the user word layout */
                bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
                                  nbits + shift);

                /*
                 * Read back the first destination word and merge its low
                 * 'shift' bits, presumably so that bits already reported in
                 * that word for lower iovas survive the copy below.
                 */
                if (copy_from_user(&leftover,
                                   (void __user *)(bitmap + copy_offset),
                                   sizeof(leftover)))
                        return -EFAULT;

                bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
        }

        if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
                         DIRTY_BITMAP_BYTES(nbits + shift)))
                return -EFAULT;

        return 0;
}
1194
1195 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1196                                   dma_addr_t iova, size_t size, size_t pgsize)
1197 {
1198         struct vfio_dma *dma;
1199         struct rb_node *n;
1200         unsigned long pgshift = __ffs(pgsize);
1201         int ret;
1202
1203         /*
1204          * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1205          * vfio_dma mappings may be clubbed by specifying large ranges, but
1206          * there must not be any previous mappings bisected by the range.
1207          * An error will be returned if these conditions are not met.
1208          */
1209         dma = vfio_find_dma(iommu, iova, 1);
1210         if (dma && dma->iova != iova)
1211                 return -EINVAL;
1212
1213         dma = vfio_find_dma(iommu, iova + size - 1, 0);
1214         if (dma && dma->iova + dma->size != iova + size)
1215                 return -EINVAL;
1216
1217         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1218                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1219
1220                 if (dma->iova < iova)
1221                         continue;
1222
1223                 if (dma->iova > iova + size - 1)
1224                         break;
1225
1226                 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1227                 if (ret)
1228                         return ret;
1229
1230                 /*
1231                  * Re-populate bitmap to include all pinned pages which are
1232                  * considered as dirty but exclude pages which are unpinned and
1233                  * pages which are marked dirty by vfio_dma_rw()
1234                  */
1235                 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1236                 vfio_dma_populate_bitmap(dma, pgsize);
1237         }
1238         return 0;
1239 }
1240
1241 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1242 {
1243         if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1244             (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1245                 return -EINVAL;
1246
1247         return 0;
1248 }
1249
/*
 * Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
 * and unmap iovas within the range we're about to unmap. Drivers MUST unpin
 * pages in response to an invalidation.
 *
 * Called with iommu->lock held.  The lock is dropped while the device
 * callbacks run and re-taken before returning, so the caller must not rely
 * on state it looked up before the call.  Returns without doing anything
 * when the container has no device on its device_list.
 */
static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
                                  struct vfio_dma *dma)
{
        struct vfio_device *device;

        if (list_empty(&iommu->device_list))
                return;

        /*
         * The device is expected to call vfio_unpin_pages() for any IOVA it has
         * pinned within the range. Since vfio_unpin_pages() will eventually
         * call back down to this code and try to obtain the iommu->lock we must
         * drop it.
         *
         * device_list_lock is taken before iommu->lock is released, so the
         * device list is covered by one of the two locks at all times.
         */
        mutex_lock(&iommu->device_list_lock);
        mutex_unlock(&iommu->lock);

        list_for_each_entry(device, &iommu->device_list, iommu_entry)
                device->ops->dma_unmap(device, dma->iova, dma->size);

        mutex_unlock(&iommu->device_list_lock);
        mutex_lock(&iommu->lock);
}
1278
/*
 * Handle a DMA unmap request for the container.
 *
 * @iommu:  container to operate on
 * @unmap:  user request; unmap->size is overwritten with the number of bytes
 *          that were unmapped (or, with VFIO_DMA_UNMAP_FLAG_VADDR, whose
 *          vaddr was invalidated)
 * @bitmap: dirty bitmap descriptor, only consulted when
 *          VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set
 *
 * Returns 0 on success.  Returns -EBUSY when a vaddr invalidation is
 * requested while emulated IOMMU groups are present.  Returns -EINVAL for a
 * misaligned or otherwise invalid range, an inconsistent use of
 * VFIO_DMA_UNMAP_FLAG_ALL, a dirty bitmap request without dirty tracking or
 * with a page size other than the minimum one, a v2 request that bisects a
 * mapping, or a vaddr invalidation that hits an already invalidated mapping.
 * An error from update_user_bitmap() is passed through; mappings removed
 * before that error stay removed and are counted in unmap->size.
 */
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
                             struct vfio_iommu_type1_dma_unmap *unmap,
                             struct vfio_bitmap *bitmap)
{
        struct vfio_dma *dma, *dma_last = NULL;
        size_t unmapped = 0, pgsize;
        int ret = -EINVAL, retries = 0;
        unsigned long pgshift;
        dma_addr_t iova = unmap->iova;
        u64 size = unmap->size;
        bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
        bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
        struct rb_node *n, *first_n;

        mutex_lock(&iommu->lock);

        /* Cannot update vaddr if mdev is present. */
        if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
                ret = -EBUSY;
                goto unlock;
        }

        /* The smallest supported page size defines the required alignment */
        pgshift = __ffs(iommu->pgsize_bitmap);
        pgsize = (size_t)1 << pgshift;

        if (iova & (pgsize - 1))
                goto unlock;

        if (unmap_all) {
                /* "Unmap all" must be requested with iova and size of zero */
                if (iova || size)
                        goto unlock;
                size = U64_MAX;
        } else if (!size || size & (pgsize - 1) ||
                   iova + size - 1 < iova || size > SIZE_MAX) {
                /* Empty, misaligned, wrapping or oversized range */
                goto unlock;
        }

        /* When dirty tracking is enabled, allow only min supported pgsize */
        if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
            (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
                goto unlock;
        }

        WARN_ON((pgsize - 1) & PAGE_MASK);
again:
        /*
         * vfio-iommu-type1 (v1) - User mappings were coalesced together to
         * avoid tracking individual mappings.  This means that the granularity
         * of the original mapping was lost and the user was allowed to attempt
         * to unmap any range.  Depending on the contiguousness of physical
         * memory and page sizes supported by the IOMMU, arbitrary unmaps may
         * or may not have worked.  We only guaranteed unmap granularity
         * matching the original mapping; even though it was untracked here,
         * the original mappings are reflected in IOMMU mappings.  This
         * resulted in a couple unusual behaviors.  First, if a range is not
         * able to be unmapped, ex. a set of 4k pages that was mapped as a
         * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
         * a zero sized unmap.  Also, if an unmap request overlaps the first
         * address of a hugepage, the IOMMU will unmap the entire hugepage.
         * This also returns success and the returned unmap size reflects the
         * actual size unmapped.
         *
         * We attempt to maintain compatibility with this "v1" interface, but
         * we take control out of the hands of the IOMMU.  Therefore, an unmap
         * request offset from the beginning of the original mapping will
         * return success with zero sized unmap.  And an unmap request covering
         * the first iova of mapping will unmap the entire range.
         *
         * The v2 version of this interface intends to be more deterministic.
         * Unmap requests must fully cover previous mappings.  Multiple
         * mappings may still be unmapped by specifying large ranges, but there
         * must not be any previous mappings bisected by the range.  An error
         * will be returned if these conditions are not met.  The v2 interface
         * will only return success and a size of zero if there were no
         * mappings within the range.
         */
        if (iommu->v2 && !unmap_all) {
                dma = vfio_find_dma(iommu, iova, 1);
                if (dma && dma->iova != iova)
                        goto unlock;

                dma = vfio_find_dma(iommu, iova + size - 1, 0);
                if (dma && dma->iova + dma->size != iova + size)
                        goto unlock;
        }

        ret = 0;
        n = first_n = vfio_find_dma_first_node(iommu, iova, size);

        while (n) {
                dma = rb_entry(n, struct vfio_dma, node);
                /* Past the end of the requested range */
                if (dma->iova >= iova + size)
                        break;

                /* v1: a request starting inside a mapping unmaps nothing */
                if (!iommu->v2 && iova > dma->iova)
                        break;

                if (invalidate_vaddr) {
                        if (dma->vaddr_invalid) {
                                struct rb_node *last_n = n;

                                /*
                                 * Already invalidated: undo the marks set
                                 * earlier in this call and fail the request.
                                 */
                                for (n = first_n; n != last_n; n = rb_next(n)) {
                                        dma = rb_entry(n,
                                                       struct vfio_dma, node);
                                        dma->vaddr_invalid = false;
                                        iommu->vaddr_invalid_count--;
                                }
                                ret = -EINVAL;
                                unmapped = 0;
                                break;
                        }
                        /* Only mark the vaddr invalid; the mapping stays */
                        dma->vaddr_invalid = true;
                        iommu->vaddr_invalid_count++;
                        unmapped += dma->size;
                        n = rb_next(n);
                        continue;
                }

                if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
                        /*
                         * Externally pinned pages remain.  Ask the devices
                         * to unpin them and start over, since the notifier
                         * drops iommu->lock.  BUG after more than ten
                         * consecutive retries on the same mapping.
                         */
                        if (dma_last == dma) {
                                BUG_ON(++retries > 10);
                        } else {
                                dma_last = dma;
                                retries = 0;
                        }

                        vfio_notify_dma_unmap(iommu, dma);
                        goto again;
                }

                if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
                        ret = update_user_bitmap(bitmap->data, iommu, dma,
                                                 iova, pgsize);
                        if (ret)
                                break;
                }

                unmapped += dma->size;
                /* Advance first: vfio_remove_dma() frees dma */
                n = rb_next(n);
                vfio_remove_dma(iommu, dma);
        }

unlock:
        mutex_unlock(&iommu->lock);

        /* Report how much was unmapped */
        unmap->size = unmapped;

        return ret;
}
1429
1430 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1431                           unsigned long pfn, long npage, int prot)
1432 {
1433         struct vfio_domain *d;
1434         int ret;
1435
1436         list_for_each_entry(d, &iommu->domain_list, next) {
1437                 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1438                                 npage << PAGE_SHIFT, prot | IOMMU_CACHE,
1439                                 GFP_KERNEL);
1440                 if (ret)
1441                         goto unwind;
1442
1443                 cond_resched();
1444         }
1445
1446         return 0;
1447
1448 unwind:
1449         list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1450                 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1451                 cond_resched();
1452         }
1453
1454         return ret;
1455 }
1456
/*
 * Pin the user memory backing @dma and map it into the container's domains.
 *
 * @iommu:    container owning @dma
 * @dma:      mapping to populate; dma->size grows as chunks are mapped
 * @map_size: number of bytes to pin and map
 *
 * The work is done in chunks: each iteration pins a run of pages at
 * dma->vaddr + dma->size and maps it at dma->iova + dma->size.
 *
 * Returns 0 on success.  On failure the error from pinning or mapping is
 * returned and @dma has been destroyed through vfio_remove_dma(), so the
 * caller must not touch it afterwards.
 */
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
                            size_t map_size)
{
        dma_addr_t iova = dma->iova;
        unsigned long vaddr = dma->vaddr;
        struct vfio_batch batch;
        size_t size = map_size;
        long npage;
        unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        int ret = 0;

        vfio_batch_init(&batch);

        while (size) {
                /* Pin a contiguous chunk of memory */
                npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
                                              size >> PAGE_SHIFT, &pfn, limit,
                                              &batch);
                if (npage <= 0) {
                        /* Zero pages pinned is unexpected; warn about it */
                        WARN_ON(!npage);
                        ret = (int)npage;
                        break;
                }

                /* Map it! */
                ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
                                     dma->prot);
                if (ret) {
                        /* Undo the pin of the chunk that failed to map */
                        vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
                                                npage, true);
                        vfio_batch_unpin(&batch, dma);
                        break;
                }

                size -= npage << PAGE_SHIFT;
                dma->size += npage << PAGE_SHIFT;
        }

        vfio_batch_fini(&batch);
        /* Set unconditionally, i.e. also before the error cleanup below */
        dma->iommu_mapped = true;

        if (ret)
                vfio_remove_dma(iommu, dma);

        return ret;
}
1503
1504 /*
1505  * Check dma map request is within a valid iova range
1506  */
1507 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1508                                       dma_addr_t start, dma_addr_t end)
1509 {
1510         struct list_head *iova = &iommu->iova_list;
1511         struct vfio_iova *node;
1512
1513         list_for_each_entry(node, iova, list) {
1514                 if (start >= node->start && end <= node->end)
1515                         return true;
1516         }
1517
1518         /*
1519          * Check for list_empty() as well since a container with
1520          * a single mdev device will have an empty list.
1521          */
1522         return list_empty(iova);
1523 }
1524
/*
 * Transfer ownership of @dma, together with its locked-memory accounting,
 * to the calling process.
 *
 * Nothing is done when current->mm already is dma->mm.  Otherwise
 * dma->locked_vm pages are first charged to the current mm; if that fails
 * the error from mm_lock_acct() is returned and @dma is left unchanged.  On
 * success the same number of pages is uncharged from the previous mm
 * (provided it still has users), and the task, mm and lock_cap recorded in
 * @dma are replaced by those of the caller, with the references moved
 * accordingly.
 *
 * Returns 0 on success or the mm_lock_acct() error.
 */
static int vfio_change_dma_owner(struct vfio_dma *dma)
{
        struct task_struct *task = current->group_leader;
        struct mm_struct *mm = current->mm;
        long npage = dma->locked_vm;
        bool lock_cap;
        int ret;

        if (mm == dma->mm)
                return 0;

        /* Charge the new owner first so a failure leaves @dma untouched */
        lock_cap = capable(CAP_IPC_LOCK);
        ret = mm_lock_acct(task, mm, lock_cap, npage);
        if (ret)
                return ret;

        /* Uncharge the old mm only if it still has users */
        if (mmget_not_zero(dma->mm)) {
                mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
                mmput(dma->mm);
        }

        if (dma->task != task) {
                put_task_struct(dma->task);
                dma->task = get_task_struct(task);
        }
        /* Swap the mm reference held by @dma */
        mmdrop(dma->mm);
        dma->mm = mm;
        mmgrab(dma->mm);
        dma->lock_cap = lock_cap;
        return 0;
}
1556
/*
 * Handle a DMA map request for the container.
 *
 * @iommu: container to operate on
 * @map:   user request (iova, vaddr, size and flags)
 *
 * Without VFIO_DMA_MAP_FLAG_VADDR a new mapping is created and, if the
 * container has an IOMMU backed domain, pinned and mapped.  With
 * VFIO_DMA_MAP_FLAG_VADDR no mapping is created; instead the vaddr of an
 * existing mapping that is marked vaddr_invalid and exactly matches
 * iova/size is replaced.
 *
 * Returns 0 on success, or:
 *   -EINVAL  a field does not fit its native type, the READ/WRITE flags are
 *            combined with (or both missing without) FLAG_VADDR, size is
 *            zero, iova/vaddr/size are misaligned or wrap, the range is
 *            outside the valid iova ranges, or the FLAG_VADDR target does
 *            not match
 *   -ENOENT  FLAG_VADDR was given but no mapping overlaps the range
 *   -EEXIST  the range overlaps an existing mapping
 *   -ENOSPC  no mapping slots are left (iommu->dma_avail is zero)
 *   -ENOMEM  the tracking structure could not be allocated
 * Errors from vfio_change_dma_owner(), vfio_pin_map_dma() and
 * vfio_dma_bitmap_alloc() are passed through.
 */
static int vfio_dma_do_map(struct vfio_iommu *iommu,
                           struct vfio_iommu_type1_dma_map *map)
{
        bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
        dma_addr_t iova = map->iova;
        unsigned long vaddr = map->vaddr;
        size_t size = map->size;
        int ret = 0, prot = 0;
        size_t pgsize;
        struct vfio_dma *dma;

        /* Verify that none of our __u64 fields overflow */
        if (map->size != size || map->vaddr != vaddr || map->iova != iova)
                return -EINVAL;

        /* READ/WRITE from device perspective */
        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
                prot |= IOMMU_WRITE;
        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
                prot |= IOMMU_READ;

        /* Exactly one of "has access flags" and "updates vaddr" must hold */
        if ((prot && set_vaddr) || (!prot && !set_vaddr))
                return -EINVAL;

        mutex_lock(&iommu->lock);

        pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

        WARN_ON((pgsize - 1) & PAGE_MASK);

        if (!size || (size | iova | vaddr) & (pgsize - 1)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /* Don't allow IOVA or virtual address wrap */
        if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
                ret = -EINVAL;
                goto out_unlock;
        }

        dma = vfio_find_dma(iommu, iova, size);
        if (set_vaddr) {
                if (!dma) {
                        ret = -ENOENT;
                } else if (!dma->vaddr_invalid || dma->iova != iova ||
                           dma->size != size) {
                        ret = -EINVAL;
                } else {
                        /* Re-home the accounting before the new vaddr is set */
                        ret = vfio_change_dma_owner(dma);
                        if (ret)
                                goto out_unlock;
                        dma->vaddr = vaddr;
                        dma->vaddr_invalid = false;
                        iommu->vaddr_invalid_count--;
                }
                goto out_unlock;
        } else if (dma) {
                ret = -EEXIST;
                goto out_unlock;
        }

        if (!iommu->dma_avail) {
                ret = -ENOSPC;
                goto out_unlock;
        }

        if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
        if (!dma) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        iommu->dma_avail--;
        dma->iova = iova;
        dma->vaddr = vaddr;
        dma->prot = prot;

        /*
         * We need to be able to both add to a task's locked memory and test
         * against the locked memory limit and we need to be able to do both
         * outside of this call path as pinning can be asynchronous via the
         * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
         * task_struct. Save the group_leader so that all DMA tracking uses
         * the same task, to make debugging easier.  VM locked pages requires
         * an mm_struct, so grab the mm in case the task dies.
         */
        get_task_struct(current->group_leader);
        dma->task = current->group_leader;
        dma->lock_cap = capable(CAP_IPC_LOCK);
        dma->mm = current->mm;
        mmgrab(dma->mm);

        dma->pfn_list = RB_ROOT;

        /* Insert zero-sized and grow as we map chunks of it */
        vfio_link_dma(iommu, dma);

        /* Don't pin and map if container doesn't contain IOMMU capable domain */
        if (list_empty(&iommu->domain_list))
                dma->size = size;
        else
                ret = vfio_pin_map_dma(iommu, dma, size);

        /* On error vfio_pin_map_dma() has already freed dma */
        if (!ret && iommu->dirty_page_tracking) {
                ret = vfio_dma_bitmap_alloc(dma, pgsize);
                if (ret)
                        vfio_remove_dma(iommu, dma);
        }

out_unlock:
        mutex_unlock(&iommu->lock);
        return ret;
}
1676
1677 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1678                              struct vfio_domain *domain)
1679 {
1680         struct vfio_batch batch;
1681         struct vfio_domain *d = NULL;
1682         struct rb_node *n;
1683         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1684         int ret;
1685
1686         /* Arbitrarily pick the first domain in the list for lookups */
1687         if (!list_empty(&iommu->domain_list))
1688                 d = list_first_entry(&iommu->domain_list,
1689                                      struct vfio_domain, next);
1690
1691         vfio_batch_init(&batch);
1692
1693         n = rb_first(&iommu->dma_list);
1694
1695         for (; n; n = rb_next(n)) {
1696                 struct vfio_dma *dma;
1697                 dma_addr_t iova;
1698
1699                 dma = rb_entry(n, struct vfio_dma, node);
1700                 iova = dma->iova;
1701
1702                 while (iova < dma->iova + dma->size) {
1703                         phys_addr_t phys;
1704                         size_t size;
1705
1706                         if (dma->iommu_mapped) {
1707                                 phys_addr_t p;
1708                                 dma_addr_t i;
1709
1710                                 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1711                                         ret = -EINVAL;
1712                                         goto unwind;
1713                                 }
1714
1715                                 phys = iommu_iova_to_phys(d->domain, iova);
1716
1717                                 if (WARN_ON(!phys)) {
1718                                         iova += PAGE_SIZE;
1719                                         continue;
1720                                 }
1721
1722                                 size = PAGE_SIZE;
1723                                 p = phys + size;
1724                                 i = iova + size;
1725                                 while (i < dma->iova + dma->size &&
1726                                        p == iommu_iova_to_phys(d->domain, i)) {
1727                                         size += PAGE_SIZE;
1728                                         p += PAGE_SIZE;
1729                                         i += PAGE_SIZE;
1730                                 }
1731                         } else {
1732                                 unsigned long pfn;
1733                                 unsigned long vaddr = dma->vaddr +
1734                                                      (iova - dma->iova);
1735                                 size_t n = dma->iova + dma->size - iova;
1736                                 long npage;
1737
1738                                 npage = vfio_pin_pages_remote(dma, vaddr,
1739                                                               n >> PAGE_SHIFT,
1740                                                               &pfn, limit,
1741                                                               &batch);
1742                                 if (npage <= 0) {
1743                                         WARN_ON(!npage);
1744                                         ret = (int)npage;
1745                                         goto unwind;
1746                                 }
1747
1748                                 phys = pfn << PAGE_SHIFT;
1749                                 size = npage << PAGE_SHIFT;
1750                         }
1751
1752                         ret = iommu_map(domain->domain, iova, phys, size,
1753                                         dma->prot | IOMMU_CACHE, GFP_KERNEL);
1754                         if (ret) {
1755                                 if (!dma->iommu_mapped) {
1756                                         vfio_unpin_pages_remote(dma, iova,
1757                                                         phys >> PAGE_SHIFT,
1758                                                         size >> PAGE_SHIFT,
1759                                                         true);
1760                                         vfio_batch_unpin(&batch, dma);
1761                                 }
1762                                 goto unwind;
1763                         }
1764
1765                         iova += size;
1766                 }
1767         }
1768
1769         /* All dmas are now mapped, defer to second tree walk for unwind */
1770         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1771                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1772
1773                 dma->iommu_mapped = true;
1774         }
1775
1776         vfio_batch_fini(&batch);
1777         return 0;
1778
1779 unwind:
1780         for (; n; n = rb_prev(n)) {
1781                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1782                 dma_addr_t iova;
1783
1784                 if (dma->iommu_mapped) {
1785                         iommu_unmap(domain->domain, dma->iova, dma->size);
1786                         continue;
1787                 }
1788
1789                 iova = dma->iova;
1790                 while (iova < dma->iova + dma->size) {
1791                         phys_addr_t phys, p;
1792                         size_t size;
1793                         dma_addr_t i;
1794
1795                         phys = iommu_iova_to_phys(domain->domain, iova);
1796                         if (!phys) {
1797                                 iova += PAGE_SIZE;
1798                                 continue;
1799                         }
1800
1801                         size = PAGE_SIZE;
1802                         p = phys + size;
1803                         i = iova + size;
1804                         while (i < dma->iova + dma->size &&
1805                                p == iommu_iova_to_phys(domain->domain, i)) {
1806                                 size += PAGE_SIZE;
1807                                 p += PAGE_SIZE;
1808                                 i += PAGE_SIZE;
1809                         }
1810
1811                         iommu_unmap(domain->domain, iova, size);
1812                         vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1813                                                 size >> PAGE_SHIFT, true);
1814                 }
1815         }
1816
1817         vfio_batch_fini(&batch);
1818         return ret;
1819 }
1820
1821 /*
1822  * We change our unmap behavior slightly depending on whether the IOMMU
1823  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1824  * for practically any contiguous power-of-two mapping we give it.  This means
1825  * we don't need to look for contiguous chunks ourselves to make unmapping
1826  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1827  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1828  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1829  * hugetlbfs is in use.
1830  */
1831 static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
1832 {
1833         int ret, order = get_order(PAGE_SIZE * 2);
1834         struct vfio_iova *region;
1835         struct page *pages;
1836         dma_addr_t start;
1837
1838         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1839         if (!pages)
1840                 return;
1841
1842         list_for_each_entry(region, regions, list) {
1843                 start = ALIGN(region->start, PAGE_SIZE * 2);
1844                 if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
1845                         continue;
1846
1847                 ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
1848                                 IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, GFP_KERNEL);
1849                 if (!ret) {
1850                         size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
1851
1852                         if (unmapped == PAGE_SIZE)
1853                                 iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
1854                         else
1855                                 domain->fgsp = true;
1856                 }
1857                 break;
1858         }
1859
1860         __free_pages(pages, order);
1861 }
1862
1863 static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
1864                                                  struct iommu_group *iommu_group)
1865 {
1866         struct vfio_iommu_group *g;
1867
1868         list_for_each_entry(g, &domain->group_list, next) {
1869                 if (g->iommu_group == iommu_group)
1870                         return g;
1871         }
1872
1873         return NULL;
1874 }
1875
1876 static struct vfio_iommu_group*
1877 vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1878                             struct iommu_group *iommu_group)
1879 {
1880         struct vfio_iommu_group *group;
1881         struct vfio_domain *domain;
1882
1883         list_for_each_entry(domain, &iommu->domain_list, next) {
1884                 group = find_iommu_group(domain, iommu_group);
1885                 if (group)
1886                         return group;
1887         }
1888
1889         list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1890                 if (group->iommu_group == iommu_group)
1891                         return group;
1892         return NULL;
1893 }
1894
1895 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1896                                   phys_addr_t *base)
1897 {
1898         struct iommu_resv_region *region;
1899         bool ret = false;
1900
1901         list_for_each_entry(region, group_resv_regions, list) {
1902                 /*
1903                  * The presence of any 'real' MSI regions should take
1904                  * precedence over the software-managed one if the
1905                  * IOMMU driver happens to advertise both types.
1906                  */
1907                 if (region->type == IOMMU_RESV_MSI) {
1908                         ret = false;
1909                         break;
1910                 }
1911
1912                 if (region->type == IOMMU_RESV_SW_MSI) {
1913                         *base = region->start;
1914                         ret = true;
1915                 }
1916         }
1917
1918         return ret;
1919 }
1920
1921 /*
1922  * This is a helper function to insert an address range to iova list.
1923  * The list is initially created with a single entry corresponding to
1924  * the IOMMU domain geometry to which the device group is attached.
1925  * The list aperture gets modified when a new domain is added to the
1926  * container if the new aperture doesn't conflict with the current one
1927  * or with any existing dma mappings. The list is also modified to
1928  * exclude any reserved regions associated with the device group.
1929  */
1930 static int vfio_iommu_iova_insert(struct list_head *head,
1931                                   dma_addr_t start, dma_addr_t end)
1932 {
1933         struct vfio_iova *region;
1934
1935         region = kmalloc(sizeof(*region), GFP_KERNEL);
1936         if (!region)
1937                 return -ENOMEM;
1938
1939         INIT_LIST_HEAD(&region->list);
1940         region->start = start;
1941         region->end = end;
1942
1943         list_add_tail(&region->list, head);
1944         return 0;
1945 }
1946
1947 /*
1948  * Check the new iommu aperture conflicts with existing aper or with any
1949  * existing dma mappings.
1950  */
1951 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1952                                      dma_addr_t start, dma_addr_t end)
1953 {
1954         struct vfio_iova *first, *last;
1955         struct list_head *iova = &iommu->iova_list;
1956
1957         if (list_empty(iova))
1958                 return false;
1959
1960         /* Disjoint sets, return conflict */
1961         first = list_first_entry(iova, struct vfio_iova, list);
1962         last = list_last_entry(iova, struct vfio_iova, list);
1963         if (start > last->end || end < first->start)
1964                 return true;
1965
1966         /* Check for any existing dma mappings below the new start */
1967         if (start > first->start) {
1968                 if (vfio_find_dma(iommu, first->start, start - first->start))
1969                         return true;
1970         }
1971
1972         /* Check for any existing dma mappings beyond the new end */
1973         if (end < last->end) {
1974                 if (vfio_find_dma(iommu, end + 1, last->end - end))
1975                         return true;
1976         }
1977
1978         return false;
1979 }
1980
1981 /*
1982  * Resize iommu iova aperture window. This is called only if the new
1983  * aperture has no conflict with existing aperture and dma mappings.
1984  */
1985 static int vfio_iommu_aper_resize(struct list_head *iova,
1986                                   dma_addr_t start, dma_addr_t end)
1987 {
1988         struct vfio_iova *node, *next;
1989
1990         if (list_empty(iova))
1991                 return vfio_iommu_iova_insert(iova, start, end);
1992
1993         /* Adjust iova list start */
1994         list_for_each_entry_safe(node, next, iova, list) {
1995                 if (start < node->start)
1996                         break;
1997                 if (start >= node->start && start < node->end) {
1998                         node->start = start;
1999                         break;
2000                 }
2001                 /* Delete nodes before new start */
2002                 list_del(&node->list);
2003                 kfree(node);
2004         }
2005
2006         /* Adjust iova list end */
2007         list_for_each_entry_safe(node, next, iova, list) {
2008                 if (end > node->end)
2009                         continue;
2010                 if (end > node->start && end <= node->end) {
2011                         node->end = end;
2012                         continue;
2013                 }
2014                 /* Delete nodes after new end */
2015                 list_del(&node->list);
2016                 kfree(node);
2017         }
2018
2019         return 0;
2020 }
2021
2022 /*
2023  * Check reserved region conflicts with existing dma mappings
2024  */
2025 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2026                                      struct list_head *resv_regions)
2027 {
2028         struct iommu_resv_region *region;
2029
2030         /* Check for conflict with existing dma mappings */
2031         list_for_each_entry(region, resv_regions, list) {
2032                 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2033                         continue;
2034
2035                 if (vfio_find_dma(iommu, region->start, region->length))
2036                         return true;
2037         }
2038
2039         return false;
2040 }
2041
2042 /*
2043  * Check iova region overlap with  reserved regions and
2044  * exclude them from the iommu iova range
2045  */
2046 static int vfio_iommu_resv_exclude(struct list_head *iova,
2047                                    struct list_head *resv_regions)
2048 {
2049         struct iommu_resv_region *resv;
2050         struct vfio_iova *n, *next;
2051
2052         list_for_each_entry(resv, resv_regions, list) {
2053                 phys_addr_t start, end;
2054
2055                 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2056                         continue;
2057
2058                 start = resv->start;
2059                 end = resv->start + resv->length - 1;
2060
2061                 list_for_each_entry_safe(n, next, iova, list) {
2062                         int ret = 0;
2063
2064                         /* No overlap */
2065                         if (start > n->end || end < n->start)
2066                                 continue;
2067                         /*
2068                          * Insert a new node if current node overlaps with the
2069                          * reserve region to exclude that from valid iova range.
2070                          * Note that, new node is inserted before the current
2071                          * node and finally the current node is deleted keeping
2072                          * the list updated and sorted.
2073                          */
2074                         if (start > n->start)
2075                                 ret = vfio_iommu_iova_insert(&n->list, n->start,
2076                                                              start - 1);
2077                         if (!ret && end < n->end)
2078                                 ret = vfio_iommu_iova_insert(&n->list, end + 1,
2079                                                              n->end);
2080                         if (ret)
2081                                 return ret;
2082
2083                         list_del(&n->list);
2084                         kfree(n);
2085                 }
2086         }
2087
2088         if (list_empty(iova))
2089                 return -EINVAL;
2090
2091         return 0;
2092 }
2093
2094 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2095 {
2096         struct iommu_resv_region *n, *next;
2097
2098         list_for_each_entry_safe(n, next, resv_regions, list) {
2099                 list_del(&n->list);
2100                 kfree(n);
2101         }
2102 }
2103
2104 static void vfio_iommu_iova_free(struct list_head *iova)
2105 {
2106         struct vfio_iova *n, *next;
2107
2108         list_for_each_entry_safe(n, next, iova, list) {
2109                 list_del(&n->list);
2110                 kfree(n);
2111         }
2112 }
2113
2114 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2115                                     struct list_head *iova_copy)
2116 {
2117         struct list_head *iova = &iommu->iova_list;
2118         struct vfio_iova *n;
2119         int ret;
2120
2121         list_for_each_entry(n, iova, list) {
2122                 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2123                 if (ret)
2124                         goto out_free;
2125         }
2126
2127         return 0;
2128
2129 out_free:
2130         vfio_iommu_iova_free(iova_copy);
2131         return ret;
2132 }
2133
2134 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2135                                         struct list_head *iova_copy)
2136 {
2137         struct list_head *iova = &iommu->iova_list;
2138
2139         vfio_iommu_iova_free(iova);
2140
2141         list_splice_tail(iova_copy, iova);
2142 }
2143
2144 static int vfio_iommu_domain_alloc(struct device *dev, void *data)
2145 {
2146         struct iommu_domain **domain = data;
2147
2148         *domain = iommu_domain_alloc(dev->bus);
2149         return 1; /* Don't iterate */
2150 }
2151
/*
 * Attach @iommu_group to the container behind @iommu_data.
 *
 * A VFIO_EMULATED_IOMMU group is only recorded on the emulated group list.
 * For any other @type a new iommu domain is allocated and the group is
 * attached to it; the valid iova list is recomputed on a private copy from
 * the domain aperture and the group's reserved regions.  The group is then
 * either moved to an existing compatible domain (the new domain is freed),
 * or the new domain has all current mappings replayed into it and is added
 * to the container.
 *
 * The whole operation runs under iommu->lock.
 *
 * Returns 0 on success.  Errors visible here: -EBUSY while any vaddr is
 * invalid, -EINVAL for a duplicate group or an aperture/reserved-region
 * conflict, -ENOMEM on allocation failure, -EIO when no domain could be
 * allocated, -EPERM when MSI isolation is missing and not overridden, or
 * whatever a called helper returned.
 */
static int vfio_iommu_type1_attach_group(void *iommu_data,
                struct iommu_group *iommu_group, enum vfio_group_type type)
{
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_iommu_group *group;
        struct vfio_domain *domain, *d;
        bool resv_msi;
        phys_addr_t resv_msi_base = 0;
        struct iommu_domain_geometry *geo;
        LIST_HEAD(iova_copy);
        LIST_HEAD(group_resv_regions);
        int ret = -EBUSY;

        mutex_lock(&iommu->lock);

        /* Attach could require pinning, so disallow while vaddr is invalid. */
        if (iommu->vaddr_invalid_count)
                goto out_unlock;

        /* Check for duplicates */
        ret = -EINVAL;
        if (vfio_iommu_find_iommu_group(iommu, iommu_group))
                goto out_unlock;

        ret = -ENOMEM;
        group = kzalloc(sizeof(*group), GFP_KERNEL);
        if (!group)
                goto out_unlock;
        group->iommu_group = iommu_group;

        if (type == VFIO_EMULATED_IOMMU) {
                list_add(&group->next, &iommu->emulated_iommu_groups);
                /*
                 * An emulated IOMMU group cannot dirty memory directly, it can
                 * only use interfaces that provide dirty tracking.
                 * The iommu scope can only be promoted with the addition of a
                 * dirty tracking group.
                 */
                group->pinned_page_dirty_scope = true;
                ret = 0;
                /* Success: out_unlock only drops the lock, group stays listed */
                goto out_unlock;
        }

        ret = -ENOMEM;
        domain = kzalloc(sizeof(*domain), GFP_KERNEL);
        if (!domain)
                goto out_free_group;

        /*
         * Going via the iommu_group iterator avoids races, and trivially gives
         * us a representative device for the IOMMU API call. We don't actually
         * want to iterate beyond the first device (if any).
         */
        ret = -EIO;
        iommu_group_for_each_dev(iommu_group, &domain->domain,
                                 vfio_iommu_domain_alloc);
        /* Still NULL from kzalloc() if the callback stored no domain */
        if (!domain->domain)
                goto out_free_domain;

        if (iommu->nesting) {
                ret = iommu_enable_nesting(domain->domain);
                if (ret)
                        goto out_domain;
        }

        ret = iommu_attach_group(domain->domain, group->iommu_group);
        if (ret)
                goto out_domain;

        /* Get aperture info */
        geo = &domain->domain->geometry;
        if (vfio_iommu_aper_conflict(iommu, geo->aperture_start,
                                     geo->aperture_end)) {
                ret = -EINVAL;
                goto out_detach;
        }

        ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
        if (ret)
                goto out_detach;

        if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
                ret = -EINVAL;
                goto out_detach;
        }

        /*
         * We don't want to work on the original iova list as the list
         * gets modified and in case of failure we have to retain the
         * original list. Get a copy here.
         */
        ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
        if (ret)
                goto out_detach;

        ret = vfio_iommu_aper_resize(&iova_copy, geo->aperture_start,
                                     geo->aperture_end);
        if (ret)
                goto out_detach;

        ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
        if (ret)
                goto out_detach;

        resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);

        INIT_LIST_HEAD(&domain->group_list);
        list_add(&group->next, &domain->group_list);

        if (!allow_unsafe_interrupts &&
            !iommu_group_has_isolated_msi(iommu_group)) {
                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
                       __func__);
                ret = -EPERM;
                goto out_detach;
        }

        /*
         * If the IOMMU can block non-coherent operations (ie PCIe TLPs with
         * no-snoop set) then VFIO always turns this feature on because on Intel
         * platforms it optimizes KVM to disable wbinvd emulation.
         */
        if (domain->domain->ops->enforce_cache_coherency)
                domain->enforce_cache_coherency =
                        domain->domain->ops->enforce_cache_coherency(
                                domain->domain);

        /*
         * Try to match an existing compatible domain.  We don't want to
         * preclude an IOMMU driver supporting multiple bus_types and being
         * able to include different bus_types in the same IOMMU domain, so
         * we test whether the domains use the same iommu_ops rather than
         * testing if they're on the same bus_type.
         */
        list_for_each_entry(d, &iommu->domain_list, next) {
                if (d->domain->ops == domain->domain->ops &&
                    d->enforce_cache_coherency ==
                            domain->enforce_cache_coherency) {
                        iommu_detach_group(domain->domain, group->iommu_group);
                        if (!iommu_attach_group(d->domain,
                                                group->iommu_group)) {
                                /*
                                 * The group now lives on the existing domain;
                                 * the freshly allocated one is not needed.
                                 */
                                list_add(&group->next, &d->group_list);
                                iommu_domain_free(domain->domain);
                                kfree(domain);
                                goto done;
                        }

                        /* Could not join d: go back to the new domain */
                        ret = iommu_attach_group(domain->domain,
                                                 group->iommu_group);
                        if (ret)
                                goto out_domain;
                }
        }

        vfio_test_domain_fgsp(domain, &iova_copy);

        /* replay mappings on new domains */
        ret = vfio_iommu_replay(iommu, domain);
        if (ret)
                goto out_detach;

        if (resv_msi) {
                /* -ENODEV from the cookie setup is tolerated */
                ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
                /*
                 * NOTE(review): this failure path does not visibly undo any
                 * page pinning done by vfio_iommu_replay() above - confirm.
                 */
                if (ret && ret != -ENODEV)
                        goto out_detach;
        }

        list_add(&domain->next, &iommu->domain_list);
        vfio_update_pgsize_bitmap(iommu);
done:
        /* Delete the old one and insert new iova list */
        vfio_iommu_iova_insert_copy(iommu, &iova_copy);

        /*
         * An iommu backed group can dirty memory directly and therefore
         * demotes the iommu scope until it declares itself dirty tracking
         * capable via the page pinning interface.
         */
        iommu->num_non_pinned_groups++;
        mutex_unlock(&iommu->lock);
        vfio_iommu_resv_free(&group_resv_regions);

        return 0;

        /*
         * Error unwinding: each label releases one more resource than the
         * label that follows it.
         */
out_detach:
        iommu_detach_group(domain->domain, group->iommu_group);
out_domain:
        iommu_domain_free(domain->domain);
        vfio_iommu_iova_free(&iova_copy);
        vfio_iommu_resv_free(&group_resv_regions);
out_free_domain:
        kfree(domain);
out_free_group:
        kfree(group);
out_unlock:
        mutex_unlock(&iommu->lock);
        return ret;
}
2350
2351 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2352 {
2353         struct rb_node *node;
2354
2355         while ((node = rb_first(&iommu->dma_list)))
2356                 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2357 }
2358
2359 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2360 {
2361         struct rb_node *n, *p;
2362
2363         n = rb_first(&iommu->dma_list);
2364         for (; n; n = rb_next(n)) {
2365                 struct vfio_dma *dma;
2366                 long locked = 0, unlocked = 0;
2367
2368                 dma = rb_entry(n, struct vfio_dma, node);
2369                 unlocked += vfio_unmap_unpin(iommu, dma, false);
2370                 p = rb_first(&dma->pfn_list);
2371                 for (; p; p = rb_next(p)) {
2372                         struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2373                                                          node);
2374
2375                         if (!is_invalid_reserved_pfn(vpfn->pfn))
2376                                 locked++;
2377                 }
2378                 vfio_lock_acct(dma, locked - unlocked, true);
2379         }
2380 }
2381
2382 /*
2383  * Called when a domain is removed in detach. It is possible that
2384  * the removed domain decided the iova aperture window. Modify the
2385  * iova aperture with the smallest window among existing domains.
2386  */
2387 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2388                                    struct list_head *iova_copy)
2389 {
2390         struct vfio_domain *domain;
2391         struct vfio_iova *node;
2392         dma_addr_t start = 0;
2393         dma_addr_t end = (dma_addr_t)~0;
2394
2395         if (list_empty(iova_copy))
2396                 return;
2397
2398         list_for_each_entry(domain, &iommu->domain_list, next) {
2399                 struct iommu_domain_geometry *geo = &domain->domain->geometry;
2400
2401                 if (geo->aperture_start > start)
2402                         start = geo->aperture_start;
2403                 if (geo->aperture_end < end)
2404                         end = geo->aperture_end;
2405         }
2406
2407         /* Modify aperture limits. The new aper is either same or bigger */
2408         node = list_first_entry(iova_copy, struct vfio_iova, list);
2409         node->start = start;
2410         node = list_last_entry(iova_copy, struct vfio_iova, list);
2411         node->end = end;
2412 }
2413
2414 /*
2415  * Called when a group is detached. The reserved regions for that
2416  * group can be part of valid iova now. But since reserved regions
2417  * may be duplicated among groups, populate the iova valid regions
2418  * list again.
2419  */
2420 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2421                                    struct list_head *iova_copy)
2422 {
2423         struct vfio_domain *d;
2424         struct vfio_iommu_group *g;
2425         struct vfio_iova *node;
2426         dma_addr_t start, end;
2427         LIST_HEAD(resv_regions);
2428         int ret;
2429
2430         if (list_empty(iova_copy))
2431                 return -EINVAL;
2432
2433         list_for_each_entry(d, &iommu->domain_list, next) {
2434                 list_for_each_entry(g, &d->group_list, next) {
2435                         ret = iommu_get_group_resv_regions(g->iommu_group,
2436                                                            &resv_regions);
2437                         if (ret)
2438                                 goto done;
2439                 }
2440         }
2441
2442         node = list_first_entry(iova_copy, struct vfio_iova, list);
2443         start = node->start;
2444         node = list_last_entry(iova_copy, struct vfio_iova, list);
2445         end = node->end;
2446
2447         /* purge the iova list and create new one */
2448         vfio_iommu_iova_free(iova_copy);
2449
2450         ret = vfio_iommu_aper_resize(iova_copy, start, end);
2451         if (ret)
2452                 goto done;
2453
2454         /* Exclude current reserved regions from iova ranges */
2455         ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2456 done:
2457         vfio_iommu_resv_free(&resv_regions);
2458         return ret;
2459 }
2460
2461 static void vfio_iommu_type1_detach_group(void *iommu_data,
2462                                           struct iommu_group *iommu_group)
2463 {
2464         struct vfio_iommu *iommu = iommu_data;
2465         struct vfio_domain *domain;
2466         struct vfio_iommu_group *group;
2467         bool update_dirty_scope = false;
2468         LIST_HEAD(iova_copy);
2469
2470         mutex_lock(&iommu->lock);
2471         list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2472                 if (group->iommu_group != iommu_group)
2473                         continue;
2474                 update_dirty_scope = !group->pinned_page_dirty_scope;
2475                 list_del(&group->next);
2476                 kfree(group);
2477
2478                 if (list_empty(&iommu->emulated_iommu_groups) &&
2479                     list_empty(&iommu->domain_list)) {
2480                         WARN_ON(!list_empty(&iommu->device_list));
2481                         vfio_iommu_unmap_unpin_all(iommu);
2482                 }
2483                 goto detach_group_done;
2484         }
2485
2486         /*
2487          * Get a copy of iova list. This will be used to update
2488          * and to replace the current one later. Please note that
2489          * we will leave the original list as it is if update fails.
2490          */
2491         vfio_iommu_iova_get_copy(iommu, &iova_copy);
2492
2493         list_for_each_entry(domain, &iommu->domain_list, next) {
2494                 group = find_iommu_group(domain, iommu_group);
2495                 if (!group)
2496                         continue;
2497
2498                 iommu_detach_group(domain->domain, group->iommu_group);
2499                 update_dirty_scope = !group->pinned_page_dirty_scope;
2500                 list_del(&group->next);
2501                 kfree(group);
2502                 /*
2503                  * Group ownership provides privilege, if the group list is
2504                  * empty, the domain goes away. If it's the last domain with
2505                  * iommu and external domain doesn't exist, then all the
2506                  * mappings go away too. If it's the last domain with iommu and
2507                  * external domain exist, update accounting
2508                  */
2509                 if (list_empty(&domain->group_list)) {
2510                         if (list_is_singular(&iommu->domain_list)) {
2511                                 if (list_empty(&iommu->emulated_iommu_groups)) {
2512                                         WARN_ON(!list_empty(
2513                                                 &iommu->device_list));
2514                                         vfio_iommu_unmap_unpin_all(iommu);
2515                                 } else {
2516                                         vfio_iommu_unmap_unpin_reaccount(iommu);
2517                                 }
2518                         }
2519                         iommu_domain_free(domain->domain);
2520                         list_del(&domain->next);
2521                         kfree(domain);
2522                         vfio_iommu_aper_expand(iommu, &iova_copy);
2523                         vfio_update_pgsize_bitmap(iommu);
2524                 }
2525                 break;
2526         }
2527
2528         if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2529                 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2530         else
2531                 vfio_iommu_iova_free(&iova_copy);
2532
2533 detach_group_done:
2534         /*
2535          * Removal of a group without dirty tracking may allow the iommu scope
2536          * to be promoted.
2537          */
2538         if (update_dirty_scope) {
2539                 iommu->num_non_pinned_groups--;
2540                 if (iommu->dirty_page_tracking)
2541                         vfio_iommu_populate_bitmap_full(iommu);
2542         }
2543         mutex_unlock(&iommu->lock);
2544 }
2545
2546 static void *vfio_iommu_type1_open(unsigned long arg)
2547 {
2548         struct vfio_iommu *iommu;
2549
2550         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2551         if (!iommu)
2552                 return ERR_PTR(-ENOMEM);
2553
2554         switch (arg) {
2555         case VFIO_TYPE1_IOMMU:
2556                 break;
2557         case VFIO_TYPE1_NESTING_IOMMU:
2558                 iommu->nesting = true;
2559                 fallthrough;
2560         case VFIO_TYPE1v2_IOMMU:
2561                 iommu->v2 = true;
2562                 break;
2563         default:
2564                 kfree(iommu);
2565                 return ERR_PTR(-EINVAL);
2566         }
2567
2568         INIT_LIST_HEAD(&iommu->domain_list);
2569         INIT_LIST_HEAD(&iommu->iova_list);
2570         iommu->dma_list = RB_ROOT;
2571         iommu->dma_avail = dma_entry_limit;
2572         mutex_init(&iommu->lock);
2573         mutex_init(&iommu->device_list_lock);
2574         INIT_LIST_HEAD(&iommu->device_list);
2575         iommu->pgsize_bitmap = PAGE_MASK;
2576         INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
2577
2578         return iommu;
2579 }
2580
2581 static void vfio_release_domain(struct vfio_domain *domain)
2582 {
2583         struct vfio_iommu_group *group, *group_tmp;
2584
2585         list_for_each_entry_safe(group, group_tmp,
2586                                  &domain->group_list, next) {
2587                 iommu_detach_group(domain->domain, group->iommu_group);
2588                 list_del(&group->next);
2589                 kfree(group);
2590         }
2591
2592         iommu_domain_free(domain->domain);
2593 }
2594
2595 static void vfio_iommu_type1_release(void *iommu_data)
2596 {
2597         struct vfio_iommu *iommu = iommu_data;
2598         struct vfio_domain *domain, *domain_tmp;
2599         struct vfio_iommu_group *group, *next_group;
2600
2601         list_for_each_entry_safe(group, next_group,
2602                         &iommu->emulated_iommu_groups, next) {
2603                 list_del(&group->next);
2604                 kfree(group);
2605         }
2606
2607         vfio_iommu_unmap_unpin_all(iommu);
2608
2609         list_for_each_entry_safe(domain, domain_tmp,
2610                                  &iommu->domain_list, next) {
2611                 vfio_release_domain(domain);
2612                 list_del(&domain->next);
2613                 kfree(domain);
2614         }
2615
2616         vfio_iommu_iova_free(&iommu->iova_list);
2617
2618         kfree(iommu);
2619 }
2620
2621 static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2622 {
2623         struct vfio_domain *domain;
2624         int ret = 1;
2625
2626         mutex_lock(&iommu->lock);
2627         list_for_each_entry(domain, &iommu->domain_list, next) {
2628                 if (!(domain->enforce_cache_coherency)) {
2629                         ret = 0;
2630                         break;
2631                 }
2632         }
2633         mutex_unlock(&iommu->lock);
2634
2635         return ret;
2636 }
2637
2638 static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2639 {
2640         bool ret;
2641
2642         mutex_lock(&iommu->lock);
2643         ret = !list_empty(&iommu->emulated_iommu_groups);
2644         mutex_unlock(&iommu->lock);
2645         return ret;
2646 }
2647
2648 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2649                                             unsigned long arg)
2650 {
2651         switch (arg) {
2652         case VFIO_TYPE1_IOMMU:
2653         case VFIO_TYPE1v2_IOMMU:
2654         case VFIO_TYPE1_NESTING_IOMMU:
2655         case VFIO_UNMAP_ALL:
2656                 return 1;
2657         case VFIO_UPDATE_VADDR:
2658                 /*
2659                  * Disable this feature if mdevs are present.  They cannot
2660                  * safely pin/unpin/rw while vaddrs are being updated.
2661                  */
2662                 return iommu && !vfio_iommu_has_emulated(iommu);
2663         case VFIO_DMA_CC_IOMMU:
2664                 if (!iommu)
2665                         return 0;
2666                 return vfio_domains_have_enforce_cache_coherency(iommu);
2667         default:
2668                 return 0;
2669         }
2670 }
2671
2672 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2673                  struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2674                  size_t size)
2675 {
2676         struct vfio_info_cap_header *header;
2677         struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2678
2679         header = vfio_info_cap_add(caps, size,
2680                                    VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2681         if (IS_ERR(header))
2682                 return PTR_ERR(header);
2683
2684         iova_cap = container_of(header,
2685                                 struct vfio_iommu_type1_info_cap_iova_range,
2686                                 header);
2687         iova_cap->nr_iovas = cap_iovas->nr_iovas;
2688         memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2689                cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2690         return 0;
2691 }
2692
2693 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2694                                       struct vfio_info_cap *caps)
2695 {
2696         struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2697         struct vfio_iova *iova;
2698         size_t size;
2699         int iovas = 0, i = 0, ret;
2700
2701         list_for_each_entry(iova, &iommu->iova_list, list)
2702                 iovas++;
2703
2704         if (!iovas) {
2705                 /*
2706                  * Return 0 as a container with a single mdev device
2707                  * will have an empty list
2708                  */
2709                 return 0;
2710         }
2711
2712         size = struct_size(cap_iovas, iova_ranges, iovas);
2713
2714         cap_iovas = kzalloc(size, GFP_KERNEL);
2715         if (!cap_iovas)
2716                 return -ENOMEM;
2717
2718         cap_iovas->nr_iovas = iovas;
2719
2720         list_for_each_entry(iova, &iommu->iova_list, list) {
2721                 cap_iovas->iova_ranges[i].start = iova->start;
2722                 cap_iovas->iova_ranges[i].end = iova->end;
2723                 i++;
2724         }
2725
2726         ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2727
2728         kfree(cap_iovas);
2729         return ret;
2730 }
2731
2732 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2733                                            struct vfio_info_cap *caps)
2734 {
2735         struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2736
2737         cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2738         cap_mig.header.version = 1;
2739
2740         cap_mig.flags = 0;
2741         /* support minimum pgsize */
2742         cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2743         cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2744
2745         return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2746 }
2747
2748 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2749                                            struct vfio_info_cap *caps)
2750 {
2751         struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2752
2753         cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2754         cap_dma_avail.header.version = 1;
2755
2756         cap_dma_avail.avail = iommu->dma_avail;
2757
2758         return vfio_info_add_capability(caps, &cap_dma_avail.header,
2759                                         sizeof(cap_dma_avail));
2760 }
2761
2762 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2763                                      unsigned long arg)
2764 {
2765         struct vfio_iommu_type1_info info = {};
2766         unsigned long minsz;
2767         struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2768         int ret;
2769
2770         minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2771
2772         if (copy_from_user(&info, (void __user *)arg, minsz))
2773                 return -EFAULT;
2774
2775         if (info.argsz < minsz)
2776                 return -EINVAL;
2777
2778         minsz = min_t(size_t, info.argsz, sizeof(info));
2779
2780         mutex_lock(&iommu->lock);
2781         info.flags = VFIO_IOMMU_INFO_PGSIZES;
2782
2783         info.iova_pgsizes = iommu->pgsize_bitmap;
2784
2785         ret = vfio_iommu_migration_build_caps(iommu, &caps);
2786
2787         if (!ret)
2788                 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2789
2790         if (!ret)
2791                 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2792
2793         mutex_unlock(&iommu->lock);
2794
2795         if (ret)
2796                 return ret;
2797
2798         if (caps.size) {
2799                 info.flags |= VFIO_IOMMU_INFO_CAPS;
2800
2801                 if (info.argsz < sizeof(info) + caps.size) {
2802                         info.argsz = sizeof(info) + caps.size;
2803                 } else {
2804                         vfio_info_cap_shift(&caps, sizeof(info));
2805                         if (copy_to_user((void __user *)arg +
2806                                         sizeof(info), caps.buf,
2807                                         caps.size)) {
2808                                 kfree(caps.buf);
2809                                 return -EFAULT;
2810                         }
2811                         info.cap_offset = sizeof(info);
2812                 }
2813
2814                 kfree(caps.buf);
2815         }
2816
2817         return copy_to_user((void __user *)arg, &info, minsz) ?
2818                         -EFAULT : 0;
2819 }
2820
2821 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2822                                     unsigned long arg)
2823 {
2824         struct vfio_iommu_type1_dma_map map;
2825         unsigned long minsz;
2826         uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2827                         VFIO_DMA_MAP_FLAG_VADDR;
2828
2829         minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2830
2831         if (copy_from_user(&map, (void __user *)arg, minsz))
2832                 return -EFAULT;
2833
2834         if (map.argsz < minsz || map.flags & ~mask)
2835                 return -EINVAL;
2836
2837         return vfio_dma_do_map(iommu, &map);
2838 }
2839
2840 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2841                                       unsigned long arg)
2842 {
2843         struct vfio_iommu_type1_dma_unmap unmap;
2844         struct vfio_bitmap bitmap = { 0 };
2845         uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2846                         VFIO_DMA_UNMAP_FLAG_VADDR |
2847                         VFIO_DMA_UNMAP_FLAG_ALL;
2848         unsigned long minsz;
2849         int ret;
2850
2851         minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2852
2853         if (copy_from_user(&unmap, (void __user *)arg, minsz))
2854                 return -EFAULT;
2855
2856         if (unmap.argsz < minsz || unmap.flags & ~mask)
2857                 return -EINVAL;
2858
2859         if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2860             (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2861                             VFIO_DMA_UNMAP_FLAG_VADDR)))
2862                 return -EINVAL;
2863
2864         if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2865                 unsigned long pgshift;
2866
2867                 if (unmap.argsz < (minsz + sizeof(bitmap)))
2868                         return -EINVAL;
2869
2870                 if (copy_from_user(&bitmap,
2871                                    (void __user *)(arg + minsz),
2872                                    sizeof(bitmap)))
2873                         return -EFAULT;
2874
2875                 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2876                         return -EINVAL;
2877
2878                 pgshift = __ffs(bitmap.pgsize);
2879                 ret = verify_bitmap_size(unmap.size >> pgshift,
2880                                          bitmap.size);
2881                 if (ret)
2882                         return ret;
2883         }
2884
2885         ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2886         if (ret)
2887                 return ret;
2888
2889         return copy_to_user((void __user *)arg, &unmap, minsz) ?
2890                         -EFAULT : 0;
2891 }
2892
2893 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2894                                         unsigned long arg)
2895 {
2896         struct vfio_iommu_type1_dirty_bitmap dirty;
2897         uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2898                         VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2899                         VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2900         unsigned long minsz;
2901         int ret = 0;
2902
2903         if (!iommu->v2)
2904                 return -EACCES;
2905
2906         minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2907
2908         if (copy_from_user(&dirty, (void __user *)arg, minsz))
2909                 return -EFAULT;
2910
2911         if (dirty.argsz < minsz || dirty.flags & ~mask)
2912                 return -EINVAL;
2913
2914         /* only one flag should be set at a time */
2915         if (__ffs(dirty.flags) != __fls(dirty.flags))
2916                 return -EINVAL;
2917
2918         if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2919                 size_t pgsize;
2920
2921                 mutex_lock(&iommu->lock);
2922                 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2923                 if (!iommu->dirty_page_tracking) {
2924                         ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2925                         if (!ret)
2926                                 iommu->dirty_page_tracking = true;
2927                 }
2928                 mutex_unlock(&iommu->lock);
2929                 return ret;
2930         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2931                 mutex_lock(&iommu->lock);
2932                 if (iommu->dirty_page_tracking) {
2933                         iommu->dirty_page_tracking = false;
2934                         vfio_dma_bitmap_free_all(iommu);
2935                 }
2936                 mutex_unlock(&iommu->lock);
2937                 return 0;
2938         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2939                 struct vfio_iommu_type1_dirty_bitmap_get range;
2940                 unsigned long pgshift;
2941                 size_t data_size = dirty.argsz - minsz;
2942                 size_t iommu_pgsize;
2943
2944                 if (!data_size || data_size < sizeof(range))
2945                         return -EINVAL;
2946
2947                 if (copy_from_user(&range, (void __user *)(arg + minsz),
2948                                    sizeof(range)))
2949                         return -EFAULT;
2950
2951                 if (range.iova + range.size < range.iova)
2952                         return -EINVAL;
2953                 if (!access_ok((void __user *)range.bitmap.data,
2954                                range.bitmap.size))
2955                         return -EINVAL;
2956
2957                 pgshift = __ffs(range.bitmap.pgsize);
2958                 ret = verify_bitmap_size(range.size >> pgshift,
2959                                          range.bitmap.size);
2960                 if (ret)
2961                         return ret;
2962
2963                 mutex_lock(&iommu->lock);
2964
2965                 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2966
2967                 /* allow only smallest supported pgsize */
2968                 if (range.bitmap.pgsize != iommu_pgsize) {
2969                         ret = -EINVAL;
2970                         goto out_unlock;
2971                 }
2972                 if (range.iova & (iommu_pgsize - 1)) {
2973                         ret = -EINVAL;
2974                         goto out_unlock;
2975                 }
2976                 if (!range.size || range.size & (iommu_pgsize - 1)) {
2977                         ret = -EINVAL;
2978                         goto out_unlock;
2979                 }
2980
2981                 if (iommu->dirty_page_tracking)
2982                         ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2983                                                      iommu, range.iova,
2984                                                      range.size,
2985                                                      range.bitmap.pgsize);
2986                 else
2987                         ret = -EINVAL;
2988 out_unlock:
2989                 mutex_unlock(&iommu->lock);
2990
2991                 return ret;
2992         }
2993
2994         return -EINVAL;
2995 }
2996
2997 static long vfio_iommu_type1_ioctl(void *iommu_data,
2998                                    unsigned int cmd, unsigned long arg)
2999 {
3000         struct vfio_iommu *iommu = iommu_data;
3001
3002         switch (cmd) {
3003         case VFIO_CHECK_EXTENSION:
3004                 return vfio_iommu_type1_check_extension(iommu, arg);
3005         case VFIO_IOMMU_GET_INFO:
3006                 return vfio_iommu_type1_get_info(iommu, arg);
3007         case VFIO_IOMMU_MAP_DMA:
3008                 return vfio_iommu_type1_map_dma(iommu, arg);
3009         case VFIO_IOMMU_UNMAP_DMA:
3010                 return vfio_iommu_type1_unmap_dma(iommu, arg);
3011         case VFIO_IOMMU_DIRTY_PAGES:
3012                 return vfio_iommu_type1_dirty_pages(iommu, arg);
3013         default:
3014                 return -ENOTTY;
3015         }
3016 }
3017
3018 static void vfio_iommu_type1_register_device(void *iommu_data,
3019                                              struct vfio_device *vdev)
3020 {
3021         struct vfio_iommu *iommu = iommu_data;
3022
3023         if (!vdev->ops->dma_unmap)
3024                 return;
3025
3026         /*
3027          * list_empty(&iommu->device_list) is tested under the iommu->lock while
3028          * iteration for dma_unmap must be done under the device_list_lock.
3029          * Holding both locks here allows avoiding the device_list_lock in
3030          * several fast paths. See vfio_notify_dma_unmap()
3031          */
3032         mutex_lock(&iommu->lock);
3033         mutex_lock(&iommu->device_list_lock);
3034         list_add(&vdev->iommu_entry, &iommu->device_list);
3035         mutex_unlock(&iommu->device_list_lock);
3036         mutex_unlock(&iommu->lock);
3037 }
3038
3039 static void vfio_iommu_type1_unregister_device(void *iommu_data,
3040                                                struct vfio_device *vdev)
3041 {
3042         struct vfio_iommu *iommu = iommu_data;
3043
3044         if (!vdev->ops->dma_unmap)
3045                 return;
3046
3047         mutex_lock(&iommu->lock);
3048         mutex_lock(&iommu->device_list_lock);
3049         list_del(&vdev->iommu_entry);
3050         mutex_unlock(&iommu->device_list_lock);
3051         mutex_unlock(&iommu->lock);
3052 }
3053
3054 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3055                                          dma_addr_t user_iova, void *data,
3056                                          size_t count, bool write,
3057                                          size_t *copied)
3058 {
3059         struct mm_struct *mm;
3060         unsigned long vaddr;
3061         struct vfio_dma *dma;
3062         bool kthread = current->mm == NULL;
3063         size_t offset;
3064
3065         *copied = 0;
3066
3067         dma = vfio_find_dma(iommu, user_iova, 1);
3068         if (!dma)
3069                 return -EINVAL;
3070
3071         if ((write && !(dma->prot & IOMMU_WRITE)) ||
3072                         !(dma->prot & IOMMU_READ))
3073                 return -EPERM;
3074
3075         mm = dma->mm;
3076         if (!mmget_not_zero(mm))
3077                 return -EPERM;
3078
3079         if (kthread)
3080                 kthread_use_mm(mm);
3081         else if (current->mm != mm)
3082                 goto out;
3083
3084         offset = user_iova - dma->iova;
3085
3086         if (count > dma->size - offset)
3087                 count = dma->size - offset;
3088
3089         vaddr = dma->vaddr + offset;
3090
3091         if (write) {
3092                 *copied = copy_to_user((void __user *)vaddr, data,
3093                                          count) ? 0 : count;
3094                 if (*copied && iommu->dirty_page_tracking) {
3095                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3096                         /*
3097                          * Bitmap populated with the smallest supported page
3098                          * size
3099                          */
3100                         bitmap_set(dma->bitmap, offset >> pgshift,
3101                                    ((offset + *copied - 1) >> pgshift) -
3102                                    (offset >> pgshift) + 1);
3103                 }
3104         } else
3105                 *copied = copy_from_user(data, (void __user *)vaddr,
3106                                            count) ? 0 : count;
3107         if (kthread)
3108                 kthread_unuse_mm(mm);
3109 out:
3110         mmput(mm);
3111         return *copied ? 0 : -EFAULT;
3112 }
3113
3114 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3115                                    void *data, size_t count, bool write)
3116 {
3117         struct vfio_iommu *iommu = iommu_data;
3118         int ret = 0;
3119         size_t done;
3120
3121         mutex_lock(&iommu->lock);
3122
3123         if (WARN_ONCE(iommu->vaddr_invalid_count,
3124                       "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3125                 ret = -EBUSY;
3126                 goto out;
3127         }
3128
3129         while (count > 0) {
3130                 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3131                                                     count, write, &done);
3132                 if (ret)
3133                         break;
3134
3135                 count -= done;
3136                 data += done;
3137                 user_iova += done;
3138         }
3139
3140 out:
3141         mutex_unlock(&iommu->lock);
3142         return ret;
3143 }
3144
3145 static struct iommu_domain *
3146 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3147                                     struct iommu_group *iommu_group)
3148 {
3149         struct iommu_domain *domain = ERR_PTR(-ENODEV);
3150         struct vfio_iommu *iommu = iommu_data;
3151         struct vfio_domain *d;
3152
3153         if (!iommu || !iommu_group)
3154                 return ERR_PTR(-EINVAL);
3155
3156         mutex_lock(&iommu->lock);
3157         list_for_each_entry(d, &iommu->domain_list, next) {
3158                 if (find_iommu_group(d, iommu_group)) {
3159                         domain = d->domain;
3160                         break;
3161                 }
3162         }
3163         mutex_unlock(&iommu->lock);
3164
3165         return domain;
3166 }
3167
3168 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3169         .name                   = "vfio-iommu-type1",
3170         .owner                  = THIS_MODULE,
3171         .open                   = vfio_iommu_type1_open,
3172         .release                = vfio_iommu_type1_release,
3173         .ioctl                  = vfio_iommu_type1_ioctl,
3174         .attach_group           = vfio_iommu_type1_attach_group,
3175         .detach_group           = vfio_iommu_type1_detach_group,
3176         .pin_pages              = vfio_iommu_type1_pin_pages,
3177         .unpin_pages            = vfio_iommu_type1_unpin_pages,
3178         .register_device        = vfio_iommu_type1_register_device,
3179         .unregister_device      = vfio_iommu_type1_unregister_device,
3180         .dma_rw                 = vfio_iommu_type1_dma_rw,
3181         .group_iommu_domain     = vfio_iommu_type1_group_iommu_domain,
3182 };
3183
3184 static int __init vfio_iommu_type1_init(void)
3185 {
3186         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3187 }
3188
3189 static void __exit vfio_iommu_type1_cleanup(void)
3190 {
3191         vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3192 }
3193
3194 module_init(vfio_iommu_type1_init);
3195 module_exit(vfio_iommu_type1_cleanup);
3196
3197 MODULE_VERSION(DRIVER_VERSION);
3198 MODULE_LICENSE("GPL v2");
3199 MODULE_AUTHOR(DRIVER_AUTHOR);
3200 MODULE_DESCRIPTION(DRIVER_DESC);