drivers/iommu/intel-iommu.c

   1 /*
   2  * Copyright © 2006-2014 Intel Corporation.
   3  *
   4  * This program is free software; you can redistribute it and/or modify it
   5  * under the terms and conditions of the GNU General Public License,
   6  * version 2, as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope it will be useful, but WITHOUT
   9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11  * more details.
  12  *
  13  * Authors: David Woodhouse <dwmw2@infradead.org>,
  14  *          Ashok Raj <ashok.raj@intel.com>,
  15  *          Shaohua Li <shaohua.li@intel.com>,
  16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17  *          Fenghua Yu <fenghua.yu@intel.com>
  18  *          Joerg Roedel <jroedel@suse.de>
  19  */
  20
  21 #define pr_fmt(fmt)     "DMAR: " fmt
  22
  23 #include <linux/init.h>
  24 #include <linux/bitmap.h>
  25 #include <linux/debugfs.h>
  26 #include <linux/export.h>
  27 #include <linux/slab.h>
  28 #include <linux/irq.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/spinlock.h>
  31 #include <linux/pci.h>
  32 #include <linux/dmar.h>
  33 #include <linux/dma-mapping.h>
  34 #include <linux/mempool.h>
  35 #include <linux/memory.h>
  36 #include <linux/cpu.h>
  37 #include <linux/timer.h>
  38 #include <linux/io.h>
  39 #include <linux/iova.h>
  40 #include <linux/iommu.h>
  41 #include <linux/intel-iommu.h>
  42 #include <linux/syscore_ops.h>
  43 #include <linux/tboot.h>
  44 #include <linux/dmi.h>
  45 #include <linux/pci-ats.h>
  46 #include <linux/memblock.h>
  47 #include <linux/dma-contiguous.h>
  48 #include <linux/crash_dump.h>
  49 #include <asm/irq_remapping.h>
  50 #include <asm/cacheflush.h>
  51 #include <asm/iommu.h>
  52
  53 #include "irq_remapping.h"
  54
/*
 * Byte sizes used for the root table and for a context table: both are
 * exactly one VT-d page.
 */
#define ROOT_SIZE               VTD_PAGE_SIZE
#define CONTEXT_SIZE            VTD_PAGE_SIZE
  57
  58 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  59 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  60 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  61 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  62
/*
 * Address window 0xfee00000-0xfeefffff, named after the IOAPIC.
 * NOTE(review): presumably kept out of IOVA allocation; the users of these
 * constants are outside this part of the file - confirm.
 */
#define IOAPIC_RANGE_START      (0xfee00000)
#define IOAPIC_RANGE_END        (0xfeefffff)
/* Lowest IO virtual byte address (0x1000). */
#define IOVA_START_ADDR         (0x1000)

/* Address width, in bits, that iommu_calculate_agaw() asks for by default. */
#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

/* Upper bound, in bits, applied by agaw_to_width(). */
#define MAX_AGAW_WIDTH 64
/* MAX_AGAW_WIDTH with the in-page offset bits removed (page-frame bits). */
#define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  71
  72 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  73 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  74
  75 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  76    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  77 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  78                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  79 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  80
/* IO virtual address start page frame number */
#define IOVA_START_PFN          (1)

/* Convert a byte address to a page frame number in units of PAGE_SIZE. */
#define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
/* Page frame numbers of the highest address under a 32-bit / 64-bit DMA mask. */
#define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
#define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))

/* page table handling: every level indexes LEVEL_STRIDE bits of the pfn */
#define LEVEL_STRIDE            (9)
/* Mask selecting one level's index bits (the low LEVEL_STRIDE bits). */
#define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  91
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of a 4KiB page
 * and that the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB (every bit
 * from bit 12 upwards is set).
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
 109
 110 static inline int agaw_to_level(int agaw)
 111 {
 112         return agaw + 2;
 113 }
 114
 115 static inline int agaw_to_width(int agaw)
 116 {
 117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118 }
 119
 120 static inline int width_to_agaw(int width)
 121 {
 122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123 }
 124
 125 static inline unsigned int level_to_offset_bits(int level)
 126 {
 127         return (level - 1) * LEVEL_STRIDE;
 128 }
 129
 130 static inline int pfn_level_offset(unsigned long pfn, int level)
 131 {
 132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133 }
 134
 135 static inline unsigned long level_mask(int level)
 136 {
 137         return -1UL << level_to_offset_bits(level);
 138 }
 139
 140 static inline unsigned long level_size(int level)
 141 {
 142         return 1UL << level_to_offset_bits(level);
 143 }
 144
 145 static inline unsigned long align_to_level(unsigned long pfn, int level)
 146 {
 147         return (pfn + level_size(level) - 1) & level_mask(level);
 148 }
 149
 150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151 {
 152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153 }
 154
 155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156    are never going to work. */
 157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158 {
 159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160 }
 161
 162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163 {
 164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165 }
 166 static inline unsigned long page_to_dma_pfn(struct page *pg)
 167 {
 168         return mm_to_dma_pfn(page_to_pfn(pg));
 169 }
 170 static inline unsigned long virt_to_dma_pfn(void *p)
 171 {
 172         return page_to_dma_pfn(virt_to_page(p));
 173 }
 174
/*
 * Global array of IOMMU pointers, indexed by IOMMU id; an entry is NULL for
 * an ignored DMAR unit.
 */
static struct intel_iommu **g_iommus;

/* Forward declaration; the definition is outside this part of the file. */
static void __init check_tylersburg_isoch(void);
/* NOTE(review): presumably set by a write-buffer-flush quirk - confirm. */
static int rwbf_quirk;

/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
/* Set to 1 by the "tboot_noforce" option of the intel_iommu= parameter. */
int intel_iommu_tboot_noforce;
 187
/*
 * Root table entry, 128 bits wide:
 *   0:      Present
 *   1-11:   Reserved
 *   12-63:  Context Ptr (12 - (haw-1))
 *   64-127: Reserved
 *
 * When extended context support is enabled, iommu_context_addr() uses 'hi'
 * in the same way as 'lo', for device functions 0x80 and above.
 */
struct root_entry {
        u64     lo;
        u64     hi;
};
/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200 /*
 201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202  * if marked present.
 203  */
 204 static phys_addr_t root_entry_lctp(struct root_entry *re)
 205 {
 206         if (!(re->lo & 1))
 207                 return 0;
 208
 209         return re->lo & VTD_PAGE_MASK;
 210 }
 211
 212 /*
 213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214  * if marked present.
 215  */
 216 static phys_addr_t root_entry_uctp(struct root_entry *re)
 217 {
 218         if (!(re->hi & 1))
 219                 return 0;
 220
 221         return re->hi & VTD_PAGE_MASK;
 222 }
/*
 * Context table entry, 128 bits wide.
 *
 * low 64 bits:
 *   0: present
 *   1: fault processing disable
 *   2-3: translation type
 *   11: PASID enable (see context_pasid_enabled())
 *   12-63: address space root
 * high 64 bits:
 *   0-2: address width
 *   3-6: available; bit 3 is used by this file as the "copied" marker
 *        (see context_set_copied())
 *   8-23: domain id
 */
struct context_entry {
        u64 lo;
        u64 hi;
};
 238
 239 static inline void context_clear_pasid_enable(struct context_entry *context)
 240 {
 241         context->lo &= ~(1ULL << 11);
 242 }
 243
 244 static inline bool context_pasid_enabled(struct context_entry *context)
 245 {
 246         return !!(context->lo & (1ULL << 11));
 247 }
 248
 249 static inline void context_set_copied(struct context_entry *context)
 250 {
 251         context->hi |= (1ull << 3);
 252 }
 253
 254 static inline bool context_copied(struct context_entry *context)
 255 {
 256         return !!(context->hi & (1ULL << 3));
 257 }
 258
 259 static inline bool __context_present(struct context_entry *context)
 260 {
 261         return (context->lo & 1);
 262 }
 263
 264 static inline bool context_present(struct context_entry *context)
 265 {
 266         return context_pasid_enabled(context) ?
 267              __context_present(context) :
 268              __context_present(context) && !context_copied(context);
 269 }
 270
 271 static inline void context_set_present(struct context_entry *context)
 272 {
 273         context->lo |= 1;
 274 }
 275
 276 static inline void context_set_fault_enable(struct context_entry *context)
 277 {
 278         context->lo &= (((u64)-1) << 2) | 1;
 279 }
 280
 281 static inline void context_set_translation_type(struct context_entry *context,
 282                                                 unsigned long value)
 283 {
 284         context->lo &= (((u64)-1) << 4) | 3;
 285         context->lo |= (value & 3) << 2;
 286 }
 287
 288 static inline void context_set_address_root(struct context_entry *context,
 289                                             unsigned long value)
 290 {
 291         context->lo &= ~VTD_PAGE_MASK;
 292         context->lo |= value & VTD_PAGE_MASK;
 293 }
 294
 295 static inline void context_set_address_width(struct context_entry *context,
 296                                              unsigned long value)
 297 {
 298         context->hi |= value & 7;
 299 }
 300
 301 static inline void context_set_domain_id(struct context_entry *context,
 302                                          unsigned long value)
 303 {
 304         context->hi |= (value & ((1 << 16) - 1)) << 8;
 305 }
 306
 307 static inline int context_domain_id(struct context_entry *c)
 308 {
 309         return((c->hi >> 8) & 0xffff);
 310 }
 311
/*
 * Zero both halves of @context.  The low half, which holds the present
 * bit, is written first.
 */
static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}
 317
/*
 * Page-table entry, 64 bits wide:
 *   0: readable
 *   1: writable
 *   2-6: reserved
 *   7: super page
 *   8-10: available
 *   11: snoop behavior
 *   12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};
 330
 331 static inline void dma_clear_pte(struct dma_pte *pte)
 332 {
 333         pte->val = 0;
 334 }
 335
 336 static inline u64 dma_pte_addr(struct dma_pte *pte)
 337 {
 338 #ifdef CONFIG_64BIT
 339         return pte->val & VTD_PAGE_MASK;
 340 #else
 341         /* Must have a full atomic 64-bit read */
 342         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343 #endif
 344 }
 345
 346 static inline bool dma_pte_present(struct dma_pte *pte)
 347 {
 348         return (pte->val & 3) != 0;
 349 }
 350
 351 static inline bool dma_pte_superpage(struct dma_pte *pte)
 352 {
 353         return (pte->val & DMA_PTE_LARGE_PAGE);
 354 }
 355
 356 static inline int first_pte_in_page(struct dma_pte *pte)
 357 {
 358         return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359 }
 360
/*
 * This domain is a static identity-mapping domain.
 *      1. This domain creates a static 1:1 mapping to all usable memory.
 *      2. It maps to each iommu if successful.
 *      3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
/* NOTE(review): presumably cleared when hardware pass-through is unusable - confirm. */
static int hw_pass_through = 1;
 369
/*
 * The domain represents a virtual machine; more than one device
 * across iommus may be owned by one domain, e.g. a kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
 378
 379 #define for_each_domain_iommu(idx, domain)                      \
 380         for (idx = 0; idx < g_num_of_iommus; idx++)             \
 381                 if (domain->iommu_refcnt[idx])
 382
/*
 * Driver-private DMA remapping domain.  It embeds the generic
 * 'struct iommu_domain' handed to the IOMMU core as its 'domain' member;
 * to_dmar_domain() converts back from that member.
 */
struct dmar_domain {
        int     nid;                    /* node id */

        unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
                                        /* Refcount of devices per iommu */


        u16             iommu_did[DMAR_UNITS_SUPPORTED];
                                        /* Domain ids per IOMMU. Use u16 since
                                         * domain ids are 16 bit wide according
                                         * to VT-d spec, section 9.3 */

        bool has_iotlb_device;
        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* DOMAIN_FLAG_* bits giving the type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 256TiB
                                           (each level multiplies the size by 512) */
        u64             max_addr;       /* maximum mapped address */

        struct iommu_domain domain;     /* generic domain data structure for
                                           iommu core */
};
 418
/* PCI domain-device relationship: one record per device attached to a domain */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        u16 pfsid;              /* SRIOV physical function source ID */
        /* Capability flags, packed into bitfields. */
        u8 pasid_supported:3;
        u8 pasid_enabled:1;
        u8 pri_supported:1;
        u8 pri_enabled:1;
        u8 ats_supported:1;
        u8 ats_enabled:1;
        u8 ats_qdep;            /* NOTE(review): presumably the ATS invalidate queue depth - confirm */
        struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
 437
/* One RMRR unit, linked on the dmar_rmrr_units list. */
struct dmar_rmrr_unit {
        struct list_head list;          /* list of rmrr units   */
        struct acpi_dmar_header *hdr;   /* ACPI header          */
        u64     base_address;           /* reserved base address*/
        u64     end_address;            /* reserved end address */
        struct dmar_dev_scope *devices; /* target devices */
        int     devices_cnt;            /* target device count */
};
 446
/* One ATSR unit, linked on the dmar_atsr_units list. */
struct dmar_atsr_unit {
        struct list_head list;          /* list of ATSR units */
        struct acpi_dmar_header *hdr;   /* ACPI header */
        struct dmar_dev_scope *devices; /* target devices */
        int devices_cnt;                /* target device count */
        u8 include_all:1;               /* include all ports */
};
 454
/* Lists of all ATSR and RMRR units known to the driver. */
static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

/* Iterate @rmrr over every entry of dmar_rmrr_units. */
#define for_each_rmrr_units(rmrr) \
        list_for_each_entry(rmrr, &dmar_rmrr_units, list)

/*
 * Number of IOMMU slots; used as the upper bound when indexing g_iommus
 * and the per-IOMMU arrays in struct dmar_domain.
 */
static int g_num_of_iommus;
 463
/*
 * Forward declarations of domain/device teardown helpers; their definitions
 * are outside this part of the file.
 */
static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
                                     struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
                                 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
                               struct intel_iommu *iommu);
 473
/*
 * Non-zero when DMA remapping is disabled.  The build-time default follows
 * CONFIG_INTEL_IOMMU_DEFAULT_ON; "intel_iommu=on" / "intel_iommu=off"
 * override it (see intel_iommu_setup()).
 */
#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

/* Tunables changed by the intel_iommu= boot options (see intel_iommu_setup()). */
static int dmar_map_gfx = 1;            /* cleared by "igfx_off" */
static int dmar_forcedac;               /* set by "forcedac" */
static int intel_iommu_strict;          /* set by "strict" */
static int intel_iommu_superpage = 1;   /* cleared by "sp_off" */
static int intel_iommu_ecs = 1;         /* cleared by "ecs_off" */
static int intel_iommu_pasid28;         /* set by "pasid28" */
static int iommu_identity_mapping;      /* mask of IDENTMAP_* bits */

/* Bits for iommu_identity_mapping. */
#define IDENTMAP_ALL            1
#define IDENTMAP_GFX            2
#define IDENTMAP_AZALIA         4
 494
 495 /* Broadwell and Skylake have broken ECS support — normal so-called "second
 496  * level" translation of DMA requests-without-PASID doesn't actually happen
 497  * unless you also set the NESTE bit in an extended context-entry. Which of
 498  * course means that SVM doesn't work because it's trying to do nested
 499  * translation of the physical addresses it finds in the process page tables,
 500  * through the IOVA->phys mapping found in the "second level" page tables.
 501  *
 502  * The VT-d specification was retroactively changed to change the definition
 503  * of the capability bits and pretend that Broadwell/Skylake never happened...
 504  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 505  * for some reason it was the PASID capability bit which was redefined (from
 506  * bit 28 on BDW/SKL to bit 40 in future).
 507  *
 508  * So our test for ECS needs to eschew those implementations which set the old
 509  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
 510  * Unless we are working around the 'pasid28' limitations, that is, by putting
 511  * the device into passthrough mode for normal DMA and thus masking the bug.
 512  */
 513 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 514                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 515 /* PASID support is thus enabled if ECS is enabled and *either* of the old
 516  * or new capability bits are set. */
 517 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
 518                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 519
int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

/*
 * Sentinel stored in dev->archdata.iommu; iommu_dummy() tests for it and
 * device_to_iommu() returns NULL for such devices.
 */
#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
/* NOTE(review): presumably device_domain_lock protects device_domain_list - confirm. */
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

/* Tentative definition; the initializer is outside this part of the file. */
const struct iommu_ops intel_iommu_ops;
 528
 529 static bool translation_pre_enabled(struct intel_iommu *iommu)
 530 {
 531         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 532 }
 533
 534 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 535 {
 536         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 537 }
 538
 539 static void init_translation_status(struct intel_iommu *iommu)
 540 {
 541         u32 gsts;
 542
 543         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 544         if (gsts & DMA_GSTS_TES)
 545                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 546 }
 547
 548 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
 549 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 550 {
 551         return container_of(dom, struct dmar_domain, domain);
 552 }
 553
 554 static int __init intel_iommu_setup(char *str)
 555 {
 556         if (!str)
 557                 return -EINVAL;
 558         while (*str) {
 559                 if (!strncmp(str, "on", 2)) {
 560                         dmar_disabled = 0;
 561                         pr_info("IOMMU enabled\n");
 562                 } else if (!strncmp(str, "off", 3)) {
 563                         dmar_disabled = 1;
 564                         pr_info("IOMMU disabled\n");
 565                 } else if (!strncmp(str, "igfx_off", 8)) {
 566                         dmar_map_gfx = 0;
 567                         pr_info("Disable GFX device mapping\n");
 568                 } else if (!strncmp(str, "forcedac", 8)) {
 569                         pr_info("Forcing DAC for PCI devices\n");
 570                         dmar_forcedac = 1;
 571                 } else if (!strncmp(str, "strict", 6)) {
 572                         pr_info("Disable batched IOTLB flush\n");
 573                         intel_iommu_strict = 1;
 574                 } else if (!strncmp(str, "sp_off", 6)) {
 575                         pr_info("Disable supported super page\n");
 576                         intel_iommu_superpage = 0;
 577                 } else if (!strncmp(str, "ecs_off", 7)) {
 578                         printk(KERN_INFO
 579                                 "Intel-IOMMU: disable extended context table support\n");
 580                         intel_iommu_ecs = 0;
 581                 } else if (!strncmp(str, "pasid28", 7)) {
 582                         printk(KERN_INFO
 583                                 "Intel-IOMMU: enable pre-production PASID support\n");
 584                         intel_iommu_pasid28 = 1;
 585                         iommu_identity_mapping |= IDENTMAP_GFX;
 586                 } else if (!strncmp(str, "tboot_noforce", 13)) {
 587                         printk(KERN_INFO
 588                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 589                         intel_iommu_tboot_noforce = 1;
 590                 }
 591
 592                 str += strcspn(str, ",");
 593                 while (*str == ',')
 594                         str++;
 595         }
 596         return 0;
 597 }
 598 __setup("intel_iommu=", intel_iommu_setup);
 599
/* Slab caches backing alloc_domain_mem() and alloc_devinfo_mem(). */
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
 602
 603 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 604 {
 605         struct dmar_domain **domains;
 606         int idx = did >> 8;
 607
 608         domains = iommu->domains[idx];
 609         if (!domains)
 610                 return NULL;
 611
 612         return domains[did & 0xff];
 613 }
 614
 615 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 616                              struct dmar_domain *domain)
 617 {
 618         struct dmar_domain **domains;
 619         int idx = did >> 8;
 620
 621         if (!iommu->domains[idx]) {
 622                 size_t size = 256 * sizeof(struct dmar_domain *);
 623                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 624         }
 625
 626         domains = iommu->domains[idx];
 627         if (WARN_ON(!domains))
 628                 return;
 629         else
 630                 domains[did & 0xff] = domain;
 631 }
 632
 633 static inline void *alloc_pgtable_page(int node)
 634 {
 635         struct page *page;
 636         void *vaddr = NULL;
 637
 638         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 639         if (page)
 640                 vaddr = page_address(page);
 641         return vaddr;
 642 }
 643
 644 static inline void free_pgtable_page(void *vaddr)
 645 {
 646         free_page((unsigned long)vaddr);
 647 }
 648
 649 static inline void *alloc_domain_mem(void)
 650 {
 651         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 652 }
 653
 654 static void free_domain_mem(void *vaddr)
 655 {
 656         kmem_cache_free(iommu_domain_cache, vaddr);
 657 }
 658
 659 static inline void * alloc_devinfo_mem(void)
 660 {
 661         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 662 }
 663
 664 static inline void free_devinfo_mem(void *vaddr)
 665 {
 666         kmem_cache_free(iommu_devinfo_cache, vaddr);
 667 }
 668
 669 static inline int domain_type_is_vm(struct dmar_domain *domain)
 670 {
 671         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 672 }
 673
 674 static inline int domain_type_is_si(struct dmar_domain *domain)
 675 {
 676         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 677 }
 678
 679 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 680 {
 681         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 682                                 DOMAIN_FLAG_STATIC_IDENTITY);
 683 }
 684
 685 static inline int domain_pfn_supported(struct dmar_domain *domain,
 686                                        unsigned long pfn)
 687 {
 688         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 689
 690         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 691 }
 692
 693 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 694 {
 695         unsigned long sagaw;
 696         int agaw = -1;
 697
 698         sagaw = cap_sagaw(iommu->cap);
 699         for (agaw = width_to_agaw(max_gaw);
 700              agaw >= 0; agaw--) {
 701                 if (test_bit(agaw, &sagaw))
 702                         break;
 703         }
 704
 705         return agaw;
 706 }
 707
 708 /*
 709  * Calculate max SAGAW for each iommu.
 710  */
 711 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 712 {
 713         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 714 }
 715
 716 /*
 717  * calculate agaw for each iommu.
 718  * "SAGAW" may be different across iommus, use a default agaw, and
 719  * get a supported less agaw for iommus that don't support the default agaw.
 720  */
 721 int iommu_calculate_agaw(struct intel_iommu *iommu)
 722 {
 723         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 724 }
 725
 726 /* This functionin only returns single iommu in a domain */
 727 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 728 {
 729         int iommu_id;
 730
 731         /* si_domain and vm domain should not get here. */
 732         BUG_ON(domain_type_is_vm_or_si(domain));
 733         for_each_domain_iommu(iommu_id, domain)
 734                 break;
 735
 736         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 737                 return NULL;
 738
 739         return g_iommus[iommu_id];
 740 }
 741
 742 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 743 {
 744         struct dmar_drhd_unit *drhd;
 745         struct intel_iommu *iommu;
 746         bool found = false;
 747         int i;
 748
 749         domain->iommu_coherency = 1;
 750
 751         for_each_domain_iommu(i, domain) {
 752                 found = true;
 753                 if (!ecap_coherent(g_iommus[i]->ecap)) {
 754                         domain->iommu_coherency = 0;
 755                         break;
 756                 }
 757         }
 758         if (found)
 759                 return;
 760
 761         /* No hardware attached; use lowest common denominator */
 762         rcu_read_lock();
 763         for_each_active_iommu(iommu, drhd) {
 764                 if (!ecap_coherent(iommu->ecap)) {
 765                         domain->iommu_coherency = 0;
 766                         break;
 767                 }
 768         }
 769         rcu_read_unlock();
 770 }
 771
 772 static int domain_update_iommu_snooping(struct intel_iommu *skip)
 773 {
 774         struct dmar_drhd_unit *drhd;
 775         struct intel_iommu *iommu;
 776         int ret = 1;
 777
 778         rcu_read_lock();
 779         for_each_active_iommu(iommu, drhd) {
 780                 if (iommu != skip) {
 781                         if (!ecap_sc_support(iommu->ecap)) {
 782                                 ret = 0;
 783                                 break;
 784                         }
 785                 }
 786         }
 787         rcu_read_unlock();
 788
 789         return ret;
 790 }
 791
 792 static int domain_update_iommu_superpage(struct intel_iommu *skip)
 793 {
 794         struct dmar_drhd_unit *drhd;
 795         struct intel_iommu *iommu;
 796         int mask = 0xf;
 797
 798         if (!intel_iommu_superpage) {
 799                 return 0;
 800         }
 801
 802         /* set iommu_superpage to the smallest common denominator */
 803         rcu_read_lock();
 804         for_each_active_iommu(iommu, drhd) {
 805                 if (iommu != skip) {
 806                         mask &= cap_super_page_val(iommu->cap);
 807                         if (!mask)
 808                                 break;
 809                 }
 810         }
 811         rcu_read_unlock();
 812
 813         return fls(mask);
 814 }
 815
 816 /* Some capabilities may be different across iommus */
 817 static void domain_update_iommu_cap(struct dmar_domain *domain)
 818 {
 819         domain_update_iommu_coherency(domain);
 820         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 821         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 822 }
 823
 824 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 825                                                        u8 bus, u8 devfn, int alloc)
 826 {
 827         struct root_entry *root = &iommu->root_entry[bus];
 828         struct context_entry *context;
 829         u64 *entry;
 830
 831         entry = &root->lo;
 832         if (ecs_enabled(iommu)) {
 833                 if (devfn >= 0x80) {
 834                         devfn -= 0x80;
 835                         entry = &root->hi;
 836                 }
 837                 devfn *= 2;
 838         }
 839         if (*entry & 1)
 840                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 841         else {
 842                 unsigned long phy_addr;
 843                 if (!alloc)
 844                         return NULL;
 845
 846                 context = alloc_pgtable_page(iommu->node);
 847                 if (!context)
 848                         return NULL;
 849
 850                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 851                 phy_addr = virt_to_phys((void *)context);
 852                 *entry = phy_addr | 1;
 853                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 854         }
 855         return &context[devfn];
 856 }
 857
 858 static int iommu_dummy(struct device *dev)
 859 {
 860         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 861 }
 862
 863 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 864 {
 865         struct dmar_drhd_unit *drhd = NULL;
 866         struct intel_iommu *iommu;
 867         struct device *tmp;
 868         struct pci_dev *ptmp, *pdev = NULL;
 869         u16 segment = 0;
 870         int i;
 871
 872         if (iommu_dummy(dev))
 873                 return NULL;
 874
 875         if (dev_is_pci(dev)) {
 876                 struct pci_dev *pf_pdev;
 877
 878                 pdev = to_pci_dev(dev);
 879
 880 #ifdef CONFIG_X86
 881                 /* VMD child devices currently cannot be handled individually */
 882                 if (is_vmd(pdev->bus))
 883                         return NULL;
 884 #endif
 885
 886                 /* VFs aren't listed in scope tables; we need to look up
 887                  * the PF instead to find the IOMMU. */
 888                 pf_pdev = pci_physfn(pdev);
 889                 dev = &pf_pdev->dev;
 890                 segment = pci_domain_nr(pdev->bus);
 891         } else if (has_acpi_companion(dev))
 892                 dev = &ACPI_COMPANION(dev)->dev;
 893
 894         rcu_read_lock();
 895         for_each_active_iommu(iommu, drhd) {
 896                 if (pdev && segment != drhd->segment)
 897                         continue;
 898
 899                 for_each_active_dev_scope(drhd->devices,
 900                                           drhd->devices_cnt, i, tmp) {
 901                         if (tmp == dev) {
 902                                 /* For a VF use its original BDF# not that of the PF
 903                                  * which we used for the IOMMU lookup. Strictly speaking
 904                                  * we could do this for all PCI devices; we only need to
 905                                  * get the BDF# from the scope table for ACPI matches. */
 906                                 if (pdev && pdev->is_virtfn)
 907                                         goto got_pdev;
 908
 909                                 *bus = drhd->devices[i].bus;
 910                                 *devfn = drhd->devices[i].devfn;
 911                                 goto out;
 912                         }
 913
 914                         if (!pdev || !dev_is_pci(tmp))
 915                                 continue;
 916
 917                         ptmp = to_pci_dev(tmp);
 918                         if (ptmp->subordinate &&
 919                             ptmp->subordinate->number <= pdev->bus->number &&
 920                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
 921                                 goto got_pdev;
 922                 }
 923
 924                 if (pdev && drhd->include_all) {
 925                 got_pdev:
 926                         *bus = pdev->bus->number;
 927                         *devfn = pdev->devfn;
 928                         goto out;
 929                 }
 930         }
 931         iommu = NULL;
 932  out:
 933         rcu_read_unlock();
 934
 935         return iommu;
 936 }
 937
 938 static void domain_flush_cache(struct dmar_domain *domain,
 939                                void *addr, int size)
 940 {
 941         if (!domain->iommu_coherency)
 942                 clflush_cache_range(addr, size);
 943 }
 944
 945 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 946 {
 947         struct context_entry *context;
 948         int ret = 0;
 949         unsigned long flags;
 950
 951         spin_lock_irqsave(&iommu->lock, flags);
 952         context = iommu_context_addr(iommu, bus, devfn, 0);
 953         if (context)
 954                 ret = context_present(context);
 955         spin_unlock_irqrestore(&iommu->lock, flags);
 956         return ret;
 957 }
 958
 959 static void free_context_table(struct intel_iommu *iommu)
 960 {
 961         int i;
 962         unsigned long flags;
 963         struct context_entry *context;
 964
 965         spin_lock_irqsave(&iommu->lock, flags);
 966         if (!iommu->root_entry) {
 967                 goto out;
 968         }
 969         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 970                 context = iommu_context_addr(iommu, i, 0, 0);
 971                 if (context)
 972                         free_pgtable_page(context);
 973
 974                 if (!ecs_enabled(iommu))
 975                         continue;
 976
 977                 context = iommu_context_addr(iommu, i, 0x80, 0);
 978                 if (context)
 979                         free_pgtable_page(context);
 980
 981         }
 982         free_pgtable_page(iommu->root_entry);
 983         iommu->root_entry = NULL;
 984 out:
 985         spin_unlock_irqrestore(&iommu->lock, flags);
 986 }
 987
 988 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 989                                       unsigned long pfn, int *target_level)
 990 {
 991         struct dma_pte *parent, *pte = NULL;
 992         int level = agaw_to_level(domain->agaw);
 993         int offset;
 994
 995         BUG_ON(!domain->pgd);
 996
 997         if (!domain_pfn_supported(domain, pfn))
 998                 /* Address beyond IOMMU's addressing capabilities. */
 999                 return NULL;
1000
1001         parent = domain->pgd;
1002
1003         while (1) {
1004                 void *tmp_page;
1005
1006                 offset = pfn_level_offset(pfn, level);
1007                 pte = &parent[offset];
1008                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009                         break;
1010                 if (level == *target_level)
1011                         break;
1012
1013                 if (!dma_pte_present(pte)) {
1014                         uint64_t pteval;
1015
1016                         tmp_page = alloc_pgtable_page(domain->nid);
1017
1018                         if (!tmp_page)
1019                                 return NULL;
1020
1021                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1024                                 /* Someone else set it while we were thinking; use theirs. */
1025                                 free_pgtable_page(tmp_page);
1026                         else
1027                                 domain_flush_cache(domain, pte, sizeof(*pte));
1028                 }
1029                 if (level == 1)
1030                         break;
1031
1032                 parent = phys_to_virt(dma_pte_addr(pte));
1033                 level--;
1034         }
1035
1036         if (!*target_level)
1037                 *target_level = level;
1038
1039         return pte;
1040 }
1041
1042
1043 /* return address's pte at specific level */
1044 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045                                          unsigned long pfn,
1046                                          int level, int *large_page)
1047 {
1048         struct dma_pte *parent, *pte = NULL;
1049         int total = agaw_to_level(domain->agaw);
1050         int offset;
1051
1052         parent = domain->pgd;
1053         while (level <= total) {
1054                 offset = pfn_level_offset(pfn, total);
1055                 pte = &parent[offset];
1056                 if (level == total)
1057                         return pte;
1058
1059                 if (!dma_pte_present(pte)) {
1060                         *large_page = total;
1061                         break;
1062                 }
1063
1064                 if (dma_pte_superpage(pte)) {
1065                         *large_page = total;
1066                         return pte;
1067                 }
1068
1069                 parent = phys_to_virt(dma_pte_addr(pte));
1070                 total--;
1071         }
1072         return NULL;
1073 }
1074
1075 /* clear last level pte, a tlb flush should be followed */
1076 static void dma_pte_clear_range(struct dmar_domain *domain,
1077                                 unsigned long start_pfn,
1078                                 unsigned long last_pfn)
1079 {
1080         unsigned int large_page = 1;
1081         struct dma_pte *first_pte, *pte;
1082
1083         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085         BUG_ON(start_pfn > last_pfn);
1086
1087         /* we don't need lock here; nobody else touches the iova range */
1088         do {
1089                 large_page = 1;
1090                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091                 if (!pte) {
1092                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093                         continue;
1094                 }
1095                 do {
1096                         dma_clear_pte(pte);
1097                         start_pfn += lvl_to_nr_pages(large_page);
1098                         pte++;
1099                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100
1101                 domain_flush_cache(domain, first_pte,
1102                                    (void *)pte - (void *)first_pte);
1103
1104         } while (start_pfn && start_pfn <= last_pfn);
1105 }
1106
1107 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108                                int retain_level, struct dma_pte *pte,
1109                                unsigned long pfn, unsigned long start_pfn,
1110                                unsigned long last_pfn)
1111 {
1112         pfn = max(start_pfn, pfn);
1113         pte = &pte[pfn_level_offset(pfn, level)];
1114
1115         do {
1116                 unsigned long level_pfn;
1117                 struct dma_pte *level_pte;
1118
1119                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120                         goto next;
1121
1122                 level_pfn = pfn & level_mask(level);
1123                 level_pte = phys_to_virt(dma_pte_addr(pte));
1124
1125                 if (level > 2) {
1126                         dma_pte_free_level(domain, level - 1, retain_level,
1127                                            level_pte, level_pfn, start_pfn,
1128                                            last_pfn);
1129                 }
1130
1131                 /*
1132                  * Free the page table if we're below the level we want to
1133                  * retain and the range covers the entire table.
1134                  */
1135                 if (level < retain_level && !(start_pfn > level_pfn ||
1136                       last_pfn < level_pfn + level_size(level) - 1)) {
1137                         dma_clear_pte(pte);
1138                         domain_flush_cache(domain, pte, sizeof(*pte));
1139                         free_pgtable_page(level_pte);
1140                 }
1141 next:
1142                 pfn += level_size(level);
1143         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144 }
1145
1146 /*
1147  * clear last level (leaf) ptes and free page table pages below the
1148  * level we wish to keep intact.
1149  */
1150 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151                                    unsigned long start_pfn,
1152                                    unsigned long last_pfn,
1153                                    int retain_level)
1154 {
1155         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157         BUG_ON(start_pfn > last_pfn);
1158
1159         dma_pte_clear_range(domain, start_pfn, last_pfn);
1160
1161         /* We don't need lock here; nobody else touches the iova range */
1162         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163                            domain->pgd, 0, start_pfn, last_pfn);
1164
1165         /* free pgd */
1166         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167                 free_pgtable_page(domain->pgd);
1168                 domain->pgd = NULL;
1169         }
1170 }
1171
1172 /* When a page at a given level is being unlinked from its parent, we don't
1173    need to *modify* it at all. All we need to do is make a list of all the
1174    pages which can be freed just as soon as we've flushed the IOTLB and we
1175    know the hardware page-walk will no longer touch them.
1176    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177    be freed. */
1178 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179                                             int level, struct dma_pte *pte,
1180                                             struct page *freelist)
1181 {
1182         struct page *pg;
1183
1184         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185         pg->freelist = freelist;
1186         freelist = pg;
1187
1188         if (level == 1)
1189                 return freelist;
1190
1191         pte = page_address(pg);
1192         do {
1193                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194                         freelist = dma_pte_list_pagetables(domain, level - 1,
1195                                                            pte, freelist);
1196                 pte++;
1197         } while (!first_pte_in_page(pte));
1198
1199         return freelist;
1200 }
1201
1202 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203                                         struct dma_pte *pte, unsigned long pfn,
1204                                         unsigned long start_pfn,
1205                                         unsigned long last_pfn,
1206                                         struct page *freelist)
1207 {
1208         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209
1210         pfn = max(start_pfn, pfn);
1211         pte = &pte[pfn_level_offset(pfn, level)];
1212
1213         do {
1214                 unsigned long level_pfn;
1215
1216                 if (!dma_pte_present(pte))
1217                         goto next;
1218
1219                 level_pfn = pfn & level_mask(level);
1220
1221                 /* If range covers entire pagetable, free it */
1222                 if (start_pfn <= level_pfn &&
1223                     last_pfn >= level_pfn + level_size(level) - 1) {
1224                         /* These suborbinate page tables are going away entirely. Don't
1225                            bother to clear them; we're just going to *free* them. */
1226                         if (level > 1 && !dma_pte_superpage(pte))
1227                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228
1229                         dma_clear_pte(pte);
1230                         if (!first_pte)
1231                                 first_pte = pte;
1232                         last_pte = pte;
1233                 } else if (level > 1) {
1234                         /* Recurse down into a level that isn't *entirely* obsolete */
1235                         freelist = dma_pte_clear_level(domain, level - 1,
1236                                                        phys_to_virt(dma_pte_addr(pte)),
1237                                                        level_pfn, start_pfn, last_pfn,
1238                                                        freelist);
1239                 }
1240 next:
1241                 pfn += level_size(level);
1242         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243
1244         if (first_pte)
1245                 domain_flush_cache(domain, first_pte,
1246                                    (void *)++last_pte - (void *)first_pte);
1247
1248         return freelist;
1249 }
1250
1251 /* We can't just free the pages because the IOMMU may still be walking
1252    the page tables, and may have cached the intermediate levels. The
1253    pages can only be freed after the IOTLB flush has been done. */
1254 static struct page *domain_unmap(struct dmar_domain *domain,
1255                                  unsigned long start_pfn,
1256                                  unsigned long last_pfn)
1257 {
1258         struct page *freelist = NULL;
1259
1260         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262         BUG_ON(start_pfn > last_pfn);
1263
1264         /* we don't need lock here; nobody else touches the iova range */
1265         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1267
1268         /* free pgd */
1269         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270                 struct page *pgd_page = virt_to_page(domain->pgd);
1271                 pgd_page->freelist = freelist;
1272                 freelist = pgd_page;
1273
1274                 domain->pgd = NULL;
1275         }
1276
1277         return freelist;
1278 }
1279
1280 static void dma_free_pagelist(struct page *freelist)
1281 {
1282         struct page *pg;
1283
1284         while ((pg = freelist)) {
1285                 freelist = pg->freelist;
1286                 free_pgtable_page(page_address(pg));
1287         }
1288 }
1289
/*
 * Callback taking an opaque word: @data is the head of a page free list
 * (a struct page pointer cast to unsigned long), which is freed here.
 */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1296
1297 /* iommu handling */
1298 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299 {
1300         struct root_entry *root;
1301         unsigned long flags;
1302
1303         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304         if (!root) {
1305                 pr_err("Allocating root entry for %s failed\n",
1306                         iommu->name);
1307                 return -ENOMEM;
1308         }
1309
1310         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1311
1312         spin_lock_irqsave(&iommu->lock, flags);
1313         iommu->root_entry = root;
1314         spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316         return 0;
1317 }
1318
1319 static void iommu_set_root_entry(struct intel_iommu *iommu)
1320 {
1321         u64 addr;
1322         u32 sts;
1323         unsigned long flag;
1324
1325         addr = virt_to_phys(iommu->root_entry);
1326         if (ecs_enabled(iommu))
1327                 addr |= DMA_RTADDR_RTT;
1328
1329         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331
1332         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333
1334         /* Make sure hardware complete it */
1335         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336                       readl, (sts & DMA_GSTS_RTPS), sts);
1337
1338         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 }
1340
1341 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342 {
1343         u32 val;
1344         unsigned long flag;
1345
1346         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347                 return;
1348
1349         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351
1352         /* Make sure hardware complete it */
1353         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354                       readl, (!(val & DMA_GSTS_WBFS)), val);
1355
1356         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357 }
1358
1359 /* return value determine if we need a write buffer flush */
1360 static void __iommu_flush_context(struct intel_iommu *iommu,
1361                                   u16 did, u16 source_id, u8 function_mask,
1362                                   u64 type)
1363 {
1364         u64 val = 0;
1365         unsigned long flag;
1366
1367         switch (type) {
1368         case DMA_CCMD_GLOBAL_INVL:
1369                 val = DMA_CCMD_GLOBAL_INVL;
1370                 break;
1371         case DMA_CCMD_DOMAIN_INVL:
1372                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373                 break;
1374         case DMA_CCMD_DEVICE_INVL:
1375                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377                 break;
1378         default:
1379                 BUG();
1380         }
1381         val |= DMA_CCMD_ICC;
1382
1383         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385
1386         /* Make sure hardware complete it */
1387         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389
1390         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391 }
1392
1393 /* return value determine if we need a write buffer flush */
1394 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395                                 u64 addr, unsigned int size_order, u64 type)
1396 {
1397         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398         u64 val = 0, val_iva = 0;
1399         unsigned long flag;
1400
1401         switch (type) {
1402         case DMA_TLB_GLOBAL_FLUSH:
1403                 /* global flush doesn't need set IVA_REG */
1404                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405                 break;
1406         case DMA_TLB_DSI_FLUSH:
1407                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408                 break;
1409         case DMA_TLB_PSI_FLUSH:
1410                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411                 /* IH bit is passed in as part of address */
1412                 val_iva = size_order | addr;
1413                 break;
1414         default:
1415                 BUG();
1416         }
1417         /* Note: set drain read/write */
1418 #if 0
1419         /*
1420          * This is probably to be super secure.. Looks like we can
1421          * ignore it without any impact.
1422          */
1423         if (cap_read_drain(iommu->cap))
1424                 val |= DMA_TLB_READ_DRAIN;
1425 #endif
1426         if (cap_write_drain(iommu->cap))
1427                 val |= DMA_TLB_WRITE_DRAIN;
1428
1429         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430         /* Note: Only uses first TLB reg currently */
1431         if (val_iva)
1432                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434
1435         /* Make sure hardware complete it */
1436         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438
1439         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440
1441         /* check IOTLB invalidation granularity */
1442         if (DMA_TLB_IAIG(val) == 0)
1443                 pr_err("Flush IOTLB failed\n");
1444         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1446                         (unsigned long long)DMA_TLB_IIRG(type),
1447                         (unsigned long long)DMA_TLB_IAIG(val));
1448 }
1449
1450 static struct device_domain_info *
1451 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452                          u8 bus, u8 devfn)
1453 {
1454         struct device_domain_info *info;
1455
1456         assert_spin_locked(&device_domain_lock);
1457
1458         if (!iommu->qi)
1459                 return NULL;
1460
1461         list_for_each_entry(info, &domain->devices, link)
1462                 if (info->iommu == iommu && info->bus == bus &&
1463                     info->devfn == devfn) {
1464                         if (info->ats_supported && info->dev)
1465                                 return info;
1466                         break;
1467                 }
1468
1469         return NULL;
1470 }
1471
1472 static void domain_update_iotlb(struct dmar_domain *domain)
1473 {
1474         struct device_domain_info *info;
1475         bool has_iotlb_device = false;
1476
1477         assert_spin_locked(&device_domain_lock);
1478
1479         list_for_each_entry(info, &domain->devices, link) {
1480                 struct pci_dev *pdev;
1481
1482                 if (!info->dev || !dev_is_pci(info->dev))
1483                         continue;
1484
1485                 pdev = to_pci_dev(info->dev);
1486                 if (pdev->ats_enabled) {
1487                         has_iotlb_device = true;
1488                         break;
1489                 }
1490         }
1491
1492         domain->has_iotlb_device = has_iotlb_device;
1493 }
1494
1495 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496 {
1497         struct pci_dev *pdev;
1498
1499         assert_spin_locked(&device_domain_lock);
1500
1501         if (!info || !dev_is_pci(info->dev))
1502                 return;
1503
1504         pdev = to_pci_dev(info->dev);
1505         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1506          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1507          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1508          * reserved, which should be set to 0.
1509          */
1510         if (!ecap_dit(info->iommu->ecap))
1511                 info->pfsid = 0;
1512         else {
1513                 struct pci_dev *pf_pdev;
1514
1515                 /* pdev will be returned if device is not a vf */
1516                 pf_pdev = pci_physfn(pdev);
1517                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1518         }
1519
1520 #ifdef CONFIG_INTEL_IOMMU_SVM
1521         /* The PCIe spec, in its wisdom, declares that the behaviour of
1522            the device if you enable PASID support after ATS support is
1523            undefined. So always enable PASID support on devices which
1524            have it, even if we can't yet know if we're ever going to
1525            use it. */
1526         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1527                 info->pasid_enabled = 1;
1528
1529         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1530                 info->pri_enabled = 1;
1531 #endif
1532         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1533                 info->ats_enabled = 1;
1534                 domain_update_iotlb(info->domain);
1535                 info->ats_qdep = pci_ats_queue_depth(pdev);
1536         }
1537 }
1538
1539 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1540 {
1541         struct pci_dev *pdev;
1542
1543         assert_spin_locked(&device_domain_lock);
1544
1545         if (!dev_is_pci(info->dev))
1546                 return;
1547
1548         pdev = to_pci_dev(info->dev);
1549
1550         if (info->ats_enabled) {
1551                 pci_disable_ats(pdev);
1552                 info->ats_enabled = 0;
1553                 domain_update_iotlb(info->domain);
1554         }
1555 #ifdef CONFIG_INTEL_IOMMU_SVM
1556         if (info->pri_enabled) {
1557                 pci_disable_pri(pdev);
1558                 info->pri_enabled = 0;
1559         }
1560         if (info->pasid_enabled) {
1561                 pci_disable_pasid(pdev);
1562                 info->pasid_enabled = 0;
1563         }
1564 #endif
1565 }
1566
1567 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1568                                   u64 addr, unsigned mask)
1569 {
1570         u16 sid, qdep;
1571         unsigned long flags;
1572         struct device_domain_info *info;
1573
1574         if (!domain->has_iotlb_device)
1575                 return;
1576
1577         spin_lock_irqsave(&device_domain_lock, flags);
1578         list_for_each_entry(info, &domain->devices, link) {
1579                 if (!info->ats_enabled)
1580                         continue;
1581
1582                 sid = info->bus << 8 | info->devfn;
1583                 qdep = info->ats_qdep;
1584                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1585                                 qdep, addr, mask);
1586         }
1587         spin_unlock_irqrestore(&device_domain_lock, flags);
1588 }
1589
1590 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1591                                   struct dmar_domain *domain,
1592                                   unsigned long pfn, unsigned int pages,
1593                                   int ih, int map)
1594 {
1595         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1596         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1597         u16 did = domain->iommu_did[iommu->seq_id];
1598
1599         BUG_ON(pages == 0);
1600
1601         if (ih)
1602                 ih = 1 << 6;
1603         /*
1604          * Fallback to domain selective flush if no PSI support or the size is
1605          * too big.
1606          * PSI requires page size to be 2 ^ x, and the base address is naturally
1607          * aligned to the size
1608          */
1609         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1610                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1611                                                 DMA_TLB_DSI_FLUSH);
1612         else
1613                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1614                                                 DMA_TLB_PSI_FLUSH);
1615
1616         /*
1617          * In caching mode, changes of pages from non-present to present require
1618          * flush. However, device IOTLB doesn't need to be flushed in this case.
1619          */
1620         if (!cap_caching_mode(iommu->cap) || !map)
1621                 iommu_flush_dev_iotlb(domain, addr, mask);
1622 }
1623
1624 static void iommu_flush_iova(struct iova_domain *iovad)
1625 {
1626         struct dmar_domain *domain;
1627         int idx;
1628
1629         domain = container_of(iovad, struct dmar_domain, iovad);
1630
1631         for_each_domain_iommu(idx, domain) {
1632                 struct intel_iommu *iommu = g_iommus[idx];
1633                 u16 did = domain->iommu_did[iommu->seq_id];
1634
1635                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1636
1637                 if (!cap_caching_mode(iommu->cap))
1638                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1639                                               0, MAX_AGAW_PFN_WIDTH);
1640         }
1641 }
1642
1643 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1644 {
1645         u32 pmen;
1646         unsigned long flags;
1647
1648         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1649                 return;
1650
1651         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1652         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1653         pmen &= ~DMA_PMEN_EPM;
1654         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1655
1656         /* wait for the protected region status bit to clear */
1657         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1658                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1659
1660         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1661 }
1662
1663 static void iommu_enable_translation(struct intel_iommu *iommu)
1664 {
1665         u32 sts;
1666         unsigned long flags;
1667
1668         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1669         iommu->gcmd |= DMA_GCMD_TE;
1670         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1671
1672         /* Make sure hardware complete it */
1673         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1674                       readl, (sts & DMA_GSTS_TES), sts);
1675
1676         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1677 }
1678
1679 static void iommu_disable_translation(struct intel_iommu *iommu)
1680 {
1681         u32 sts;
1682         unsigned long flag;
1683
1684         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685         iommu->gcmd &= ~DMA_GCMD_TE;
1686         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
1688         /* Make sure hardware complete it */
1689         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690                       readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693 }
1694
1695
1696 static int iommu_init_domains(struct intel_iommu *iommu)
1697 {
1698         u32 ndomains, nlongs;
1699         size_t size;
1700
1701         ndomains = cap_ndoms(iommu->cap);
1702         pr_debug("%s: Number of Domains supported <%d>\n",
1703                  iommu->name, ndomains);
1704         nlongs = BITS_TO_LONGS(ndomains);
1705
1706         spin_lock_init(&iommu->lock);
1707
1708         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1709         if (!iommu->domain_ids) {
1710                 pr_err("%s: Allocating domain id array failed\n",
1711                        iommu->name);
1712                 return -ENOMEM;
1713         }
1714
1715         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1716         iommu->domains = kzalloc(size, GFP_KERNEL);
1717
1718         if (iommu->domains) {
1719                 size = 256 * sizeof(struct dmar_domain *);
1720                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1721         }
1722
1723         if (!iommu->domains || !iommu->domains[0]) {
1724                 pr_err("%s: Allocating domain array failed\n",
1725                        iommu->name);
1726                 kfree(iommu->domain_ids);
1727                 kfree(iommu->domains);
1728                 iommu->domain_ids = NULL;
1729                 iommu->domains    = NULL;
1730                 return -ENOMEM;
1731         }
1732
1733
1734
1735         /*
1736          * If Caching mode is set, then invalid translations are tagged
1737          * with domain-id 0, hence we need to pre-allocate it. We also
1738          * use domain-id 0 as a marker for non-allocated domain-id, so
1739          * make sure it is not used for a real domain.
1740          */
1741         set_bit(0, iommu->domain_ids);
1742
1743         return 0;
1744 }
1745
1746 static void disable_dmar_iommu(struct intel_iommu *iommu)
1747 {
1748         struct device_domain_info *info, *tmp;
1749         unsigned long flags;
1750
1751         if (!iommu->domains || !iommu->domain_ids)
1752                 return;
1753
1754 again:
1755         spin_lock_irqsave(&device_domain_lock, flags);
1756         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1757                 struct dmar_domain *domain;
1758
1759                 if (info->iommu != iommu)
1760                         continue;
1761
1762                 if (!info->dev || !info->domain)
1763                         continue;
1764
1765                 domain = info->domain;
1766
1767                 __dmar_remove_one_dev_info(info);
1768
1769                 if (!domain_type_is_vm_or_si(domain)) {
1770                         /*
1771                          * The domain_exit() function  can't be called under
1772                          * device_domain_lock, as it takes this lock itself.
1773                          * So release the lock here and re-run the loop
1774                          * afterwards.
1775                          */
1776                         spin_unlock_irqrestore(&device_domain_lock, flags);
1777                         domain_exit(domain);
1778                         goto again;
1779                 }
1780         }
1781         spin_unlock_irqrestore(&device_domain_lock, flags);
1782
1783         if (iommu->gcmd & DMA_GCMD_TE)
1784                 iommu_disable_translation(iommu);
1785 }
1786
1787 static void free_dmar_iommu(struct intel_iommu *iommu)
1788 {
1789         if ((iommu->domains) && (iommu->domain_ids)) {
1790                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1791                 int i;
1792
1793                 for (i = 0; i < elems; i++)
1794                         kfree(iommu->domains[i]);
1795                 kfree(iommu->domains);
1796                 kfree(iommu->domain_ids);
1797                 iommu->domains = NULL;
1798                 iommu->domain_ids = NULL;
1799         }
1800
1801         g_iommus[iommu->seq_id] = NULL;
1802
1803         /* free context mapping */
1804         free_context_table(iommu);
1805
1806 #ifdef CONFIG_INTEL_IOMMU_SVM
1807         if (pasid_enabled(iommu)) {
1808                 if (ecap_prs(iommu->ecap))
1809                         intel_svm_finish_prq(iommu);
1810                 intel_svm_free_pasid_tables(iommu);
1811         }
1812 #endif
1813 }
1814
1815 static struct dmar_domain *alloc_domain(int flags)
1816 {
1817         struct dmar_domain *domain;
1818
1819         domain = alloc_domain_mem();
1820         if (!domain)
1821                 return NULL;
1822
1823         memset(domain, 0, sizeof(*domain));
1824         domain->nid = -1;
1825         domain->flags = flags;
1826         domain->has_iotlb_device = false;
1827         INIT_LIST_HEAD(&domain->devices);
1828
1829         return domain;
1830 }
1831
1832 /* Must be called with iommu->lock */
1833 static int domain_attach_iommu(struct dmar_domain *domain,
1834                                struct intel_iommu *iommu)
1835 {
1836         unsigned long ndomains;
1837         int num;
1838
1839         assert_spin_locked(&device_domain_lock);
1840         assert_spin_locked(&iommu->lock);
1841
1842         domain->iommu_refcnt[iommu->seq_id] += 1;
1843         domain->iommu_count += 1;
1844         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1845                 ndomains = cap_ndoms(iommu->cap);
1846                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1847
1848                 if (num >= ndomains) {
1849                         pr_err("%s: No free domain ids\n", iommu->name);
1850                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1851                         domain->iommu_count -= 1;
1852                         return -ENOSPC;
1853                 }
1854
1855                 set_bit(num, iommu->domain_ids);
1856                 set_iommu_domain(iommu, num, domain);
1857
1858                 domain->iommu_did[iommu->seq_id] = num;
1859                 domain->nid                      = iommu->node;
1860
1861                 domain_update_iommu_cap(domain);
1862         }
1863
1864         return 0;
1865 }
1866
1867 static int domain_detach_iommu(struct dmar_domain *domain,
1868                                struct intel_iommu *iommu)
1869 {
1870         int num, count = INT_MAX;
1871
1872         assert_spin_locked(&device_domain_lock);
1873         assert_spin_locked(&iommu->lock);
1874
1875         domain->iommu_refcnt[iommu->seq_id] -= 1;
1876         count = --domain->iommu_count;
1877         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1878                 num = domain->iommu_did[iommu->seq_id];
1879                 clear_bit(num, iommu->domain_ids);
1880                 set_iommu_domain(iommu, num, NULL);
1881
1882                 domain_update_iommu_cap(domain);
1883                 domain->iommu_did[iommu->seq_id] = 0;
1884         }
1885
1886         return count;
1887 }
1888
/*
 * IOVA ranges that must not be handed out for DMA.  Populated by
 * dmar_init_reserved_ranges() and copied into a domain's own IOVA
 * allocator by domain_reserve_special_ranges().
 */
static struct iova_domain reserved_iova_list;
/* lockdep class assigned to reserved_iova_list.iova_rbtree_lock */
static struct lock_class_key reserved_rbtree_key;
1891
1892 static int dmar_init_reserved_ranges(void)
1893 {
1894         struct pci_dev *pdev = NULL;
1895         struct iova *iova;
1896         int i;
1897
1898         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1899                         DMA_32BIT_PFN);
1900
1901         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1902                 &reserved_rbtree_key);
1903
1904         /* IOAPIC ranges shouldn't be accessed by DMA */
1905         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1906                 IOVA_PFN(IOAPIC_RANGE_END));
1907         if (!iova) {
1908                 pr_err("Reserve IOAPIC range failed\n");
1909                 return -ENODEV;
1910         }
1911
1912         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1913         for_each_pci_dev(pdev) {
1914                 struct resource *r;
1915
1916                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1917                         r = &pdev->resource[i];
1918                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1919                                 continue;
1920                         iova = reserve_iova(&reserved_iova_list,
1921                                             IOVA_PFN(r->start),
1922                                             IOVA_PFN(r->end));
1923                         if (!iova) {
1924                                 pr_err("Reserve iova failed\n");
1925                                 return -ENODEV;
1926                         }
1927                 }
1928         }
1929         return 0;
1930 }
1931
/*
 * Copy the globally reserved IOVA ranges (reserved_iova_list) into the
 * IOVA allocator of @domain.
 */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1936
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9 * n, and cap the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int agaw = rem ? gaw + 9 - rem : gaw;

        return agaw > 64 ? 64 : agaw;
}
1950
1951 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1952                        int guest_width)
1953 {
1954         int adjust_width, agaw;
1955         unsigned long sagaw;
1956         int err;
1957
1958         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1959                         DMA_32BIT_PFN);
1960
1961         err = init_iova_flush_queue(&domain->iovad,
1962                                     iommu_flush_iova, iova_entry_free);
1963         if (err)
1964                 return err;
1965
1966         domain_reserve_special_ranges(domain);
1967
1968         /* calculate AGAW */
1969         if (guest_width > cap_mgaw(iommu->cap))
1970                 guest_width = cap_mgaw(iommu->cap);
1971         domain->gaw = guest_width;
1972         adjust_width = guestwidth_to_adjustwidth(guest_width);
1973         agaw = width_to_agaw(adjust_width);
1974         sagaw = cap_sagaw(iommu->cap);
1975         if (!test_bit(agaw, &sagaw)) {
1976                 /* hardware doesn't support it, choose a bigger one */
1977                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1978                 agaw = find_next_bit(&sagaw, 5, agaw);
1979                 if (agaw >= 5)
1980                         return -ENODEV;
1981         }
1982         domain->agaw = agaw;
1983
1984         if (ecap_coherent(iommu->ecap))
1985                 domain->iommu_coherency = 1;
1986         else
1987                 domain->iommu_coherency = 0;
1988
1989         if (ecap_sc_support(iommu->ecap))
1990                 domain->iommu_snooping = 1;
1991         else
1992                 domain->iommu_snooping = 0;
1993
1994         if (intel_iommu_superpage)
1995                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1996         else
1997                 domain->iommu_superpage = 0;
1998
1999         domain->nid = iommu->node;
2000
2001         /* always allocate the top pgd */
2002         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2003         if (!domain->pgd)
2004                 return -ENOMEM;
2005         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
2006         return 0;
2007 }
2008
2009 static void domain_exit(struct dmar_domain *domain)
2010 {
2011         struct page *freelist = NULL;
2012
2013         /* Domain 0 is reserved, so dont process it */
2014         if (!domain)
2015                 return;
2016
2017         /* Remove associated devices and clear attached or cached domains */
2018         rcu_read_lock();
2019         domain_remove_dev_info(domain);
2020         rcu_read_unlock();
2021
2022         /* destroy iovas */
2023         put_iova_domain(&domain->iovad);
2024
2025         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2026
2027         dma_free_pagelist(freelist);
2028
2029         free_domain_mem(domain);
2030 }
2031
2032 static int domain_context_mapping_one(struct dmar_domain *domain,
2033                                       struct intel_iommu *iommu,
2034                                       u8 bus, u8 devfn)
2035 {
2036         u16 did = domain->iommu_did[iommu->seq_id];
2037         int translation = CONTEXT_TT_MULTI_LEVEL;
2038         struct device_domain_info *info = NULL;
2039         struct context_entry *context;
2040         unsigned long flags;
2041         struct dma_pte *pgd;
2042         int ret, agaw;
2043
2044         WARN_ON(did == 0);
2045
2046         if (hw_pass_through && domain_type_is_si(domain))
2047                 translation = CONTEXT_TT_PASS_THROUGH;
2048
2049         pr_debug("Set context mapping for %02x:%02x.%d\n",
2050                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2051
2052         BUG_ON(!domain->pgd);
2053
2054         spin_lock_irqsave(&device_domain_lock, flags);
2055         spin_lock(&iommu->lock);
2056
2057         ret = -ENOMEM;
2058         context = iommu_context_addr(iommu, bus, devfn, 1);
2059         if (!context)
2060                 goto out_unlock;
2061
2062         ret = 0;
2063         if (context_present(context))
2064                 goto out_unlock;
2065
2066         /*
2067          * For kdump cases, old valid entries may be cached due to the
2068          * in-flight DMA and copied pgtable, but there is no unmapping
2069          * behaviour for them, thus we need an explicit cache flush for
2070          * the newly-mapped device. For kdump, at this point, the device
2071          * is supposed to finish reset at its driver probe stage, so no
2072          * in-flight DMA will exist, and we don't need to worry anymore
2073          * hereafter.
2074          */
2075         if (context_copied(context)) {
2076                 u16 did_old = context_domain_id(context);
2077
2078                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap)) {
2079                         iommu->flush.flush_context(iommu, did_old,
2080                                                    (((u16)bus) << 8) | devfn,
2081                                                    DMA_CCMD_MASK_NOBIT,
2082                                                    DMA_CCMD_DEVICE_INVL);
2083                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2084                                                  DMA_TLB_DSI_FLUSH);
2085                 }
2086         }
2087
2088         pgd = domain->pgd;
2089
2090         context_clear_entry(context);
2091         context_set_domain_id(context, did);
2092
2093         /*
2094          * Skip top levels of page tables for iommu which has less agaw
2095          * than default.  Unnecessary for PT mode.
2096          */
2097         if (translation != CONTEXT_TT_PASS_THROUGH) {
2098                 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2099                         ret = -ENOMEM;
2100                         pgd = phys_to_virt(dma_pte_addr(pgd));
2101                         if (!dma_pte_present(pgd))
2102                                 goto out_unlock;
2103                 }
2104
2105                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2106                 if (info && info->ats_supported)
2107                         translation = CONTEXT_TT_DEV_IOTLB;
2108                 else
2109                         translation = CONTEXT_TT_MULTI_LEVEL;
2110
2111                 context_set_address_root(context, virt_to_phys(pgd));
2112                 context_set_address_width(context, agaw);
2113         } else {
2114                 /*
2115                  * In pass through mode, AW must be programmed to
2116                  * indicate the largest AGAW value supported by
2117                  * hardware. And ASR is ignored by hardware.
2118                  */
2119                 context_set_address_width(context, iommu->msagaw);
2120         }
2121
2122         context_set_translation_type(context, translation);
2123         context_set_fault_enable(context);
2124         context_set_present(context);
2125         domain_flush_cache(domain, context, sizeof(*context));
2126
2127         /*
2128          * It's a non-present to present mapping. If hardware doesn't cache
2129          * non-present entry we only need to flush the write-buffer. If the
2130          * _does_ cache non-present entries, then it does so in the special
2131          * domain #0, which we have to flush:
2132          */
2133         if (cap_caching_mode(iommu->cap)) {
2134                 iommu->flush.flush_context(iommu, 0,
2135                                            (((u16)bus) << 8) | devfn,
2136                                            DMA_CCMD_MASK_NOBIT,
2137                                            DMA_CCMD_DEVICE_INVL);
2138                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2139         } else {
2140                 iommu_flush_write_buffer(iommu);
2141         }
2142         iommu_enable_dev_iotlb(info);
2143
2144         ret = 0;
2145
2146 out_unlock:
2147         spin_unlock(&iommu->lock);
2148         spin_unlock_irqrestore(&device_domain_lock, flags);
2149
2150         return ret;
2151 }
2152
/*
 * Arguments passed by domain_context_mapping() through
 * pci_for_each_dma_alias() to domain_context_mapping_cb().
 */
struct domain_context_mapping_data {
        /* domain handed to domain_context_mapping_one() */
        struct dmar_domain *domain;
        /* IOMMU handed to domain_context_mapping_one() */
        struct intel_iommu *iommu;
};
2157
2158 static int domain_context_mapping_cb(struct pci_dev *pdev,
2159                                      u16 alias, void *opaque)
2160 {
2161         struct domain_context_mapping_data *data = opaque;
2162
2163         return domain_context_mapping_one(data->domain, data->iommu,
2164                                           PCI_BUS_NUM(alias), alias & 0xff);
2165 }
2166
2167 static int
2168 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2169 {
2170         struct intel_iommu *iommu;
2171         u8 bus, devfn;
2172         struct domain_context_mapping_data data;
2173
2174         iommu = device_to_iommu(dev, &bus, &devfn);
2175         if (!iommu)
2176                 return -ENODEV;
2177
2178         if (!dev_is_pci(dev))
2179                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2180
2181         data.domain = domain;
2182         data.iommu = iommu;
2183
2184         return pci_for_each_dma_alias(to_pci_dev(dev),
2185                                       &domain_context_mapping_cb, &data);
2186 }
2187
2188 static int domain_context_mapped_cb(struct pci_dev *pdev,
2189                                     u16 alias, void *opaque)
2190 {
2191         struct intel_iommu *iommu = opaque;
2192
2193         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2194 }
2195
2196 static int domain_context_mapped(struct device *dev)
2197 {
2198         struct intel_iommu *iommu;
2199         u8 bus, devfn;
2200
2201         iommu = device_to_iommu(dev, &bus, &devfn);
2202         if (!iommu)
2203                 return -ENODEV;
2204
2205         if (!dev_is_pci(dev))
2206                 return device_context_mapped(iommu, bus, devfn);
2207
2208         return !pci_for_each_dma_alias(to_pci_dev(dev),
2209                                        domain_context_mapped_cb, iommu);
2210 }
2211
2212 /* Returns a number of VTD pages, but aligned to MM page size */
2213 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2214                                             size_t size)
2215 {
2216         host_addr &= ~PAGE_MASK;
2217         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2218 }
2219
2220 /* Return largest possible superpage level for a given mapping */
2221 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2222                                           unsigned long iov_pfn,
2223                                           unsigned long phy_pfn,
2224                                           unsigned long pages)
2225 {
2226         int support, level = 1;
2227         unsigned long pfnmerge;
2228
2229         support = domain->iommu_superpage;
2230
2231         /* To use a large page, the virtual *and* physical addresses
2232            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2233            of them will mean we have to use smaller pages. So just
2234            merge them and check both at once. */
2235         pfnmerge = iov_pfn | phy_pfn;
2236
2237         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2238                 pages >>= VTD_STRIDE_SHIFT;
2239                 if (!pages)
2240                         break;
2241                 pfnmerge >>= VTD_STRIDE_SHIFT;
2242                 level++;
2243                 support--;
2244         }
2245         return level;
2246 }
2247
/*
 * Write the page-table entries that map @nr_pages VT-d pages starting at
 * IOVA page frame @iov_pfn in @domain.
 *
 * The physical side comes either from the scatterlist @sg (when non-NULL;
 * each entry's dma_address/dma_length is filled in as it is consumed) or,
 * when @sg is NULL, from the contiguous range starting at @phys_pfn.
 * Superpages are used where hardware_largepage_caps() allows it.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; only those bits
 * and DMA_PTE_SNP are used.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor write,
 * or -ENOMEM when a page-table entry cannot be obtained.  Entries written
 * before a failure are not undone here.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        phys_addr_t uninitialized_var(pteval);
        /* pages still to be mapped from the current sg entry (or in total) */
        unsigned long sg_res = 0;
        unsigned int largepage_lvl = 0;
        unsigned long lvl_pages = 0;

        BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

        if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
                return -EINVAL;

        prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

        /* contiguous case: one run covering the whole request */
        if (!sg) {
                sg_res = nr_pages;
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }

        while (nr_pages > 0) {
                uint64_t tmp;

                /* current run exhausted: start on the current sg entry */
                if (!sg_res) {
                        unsigned int pgoff = sg->offset & ~PAGE_MASK;

                        sg_res = aligned_nrpages(sg->offset, sg->length);
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
                        sg->dma_length = sg->length;
                        pteval = (sg_phys(sg) - pgoff) | prot;
                        phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }

                /* (re)locate the PTE and decide which page size to use */
                if (!pte) {
                        largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

                        first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
                        /* It is a large page */
                        if (largepage_lvl > 1) {
                                unsigned long nr_superpages, end_pfn;

                                pteval |= DMA_PTE_LARGE_PAGE;
                                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                                nr_superpages = sg_res / lvl_pages;
                                end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

                                /*
                                 * Ensure that old small page tables are
                                 * removed to make room for superpage(s).
                                 * We're adding new large pages, so make sure
                                 * we don't remove their parent tables.
                                 */
                                dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
                                                       largepage_lvl + 1);
                        } else {
                                pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                        }

                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                 */
                tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
                /* a non-zero old value means the PTE was already in use */
                if (tmp) {
                        /* limit the number of full mapping dumps */
                        static int dumps = 5;
                        pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
                                iov_pfn, tmp, (unsigned long long)pteval);
                        if (dumps) {
                                dumps--;
                                debug_dma_dump_mappings(NULL);
                        }
                        WARN_ON(1);
                }

                lvl_pages = lvl_to_nr_pages(largepage_lvl);

                BUG_ON(nr_pages < lvl_pages);
                BUG_ON(sg_res < lvl_pages);

                /* advance all cursors by the size of the entry just written */
                nr_pages -= lvl_pages;
                iov_pfn += lvl_pages;
                phys_pfn += lvl_pages;
                pteval += lvl_pages * VTD_PAGE_SIZE;
                sg_res -= lvl_pages;

                /* If the next PTE would be the first in a new page, then we
                   need to flush the cache on the entries we've just written.
                   And then we'll need to recalculate 'pte', so clear it and
                   let it get set again in the if (!pte) block above.

                   If we're done (!nr_pages) we need to flush the cache too.

                   Also if we've been setting superpages, we may need to
                   recalculate 'pte' and switch back to smaller pages for the
                   end of the mapping, if the trailing size is not enough to
                   use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
                if (!nr_pages || first_pte_in_page(pte) ||
                    (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }

                /* sg entry fully mapped but more to do: go to the next one */
                if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
}
2362
/*
 * Map @nr_pages pages at @iov_pfn from the scatterlist @sg.  Thin wrapper
 * around __domain_mapping() with no explicit physical PFN.
 */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                    struct scatterlist *sg, unsigned long nr_pages,
                                    int prot)
{
        return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
2369
/*
 * Map @nr_pages pages at @iov_pfn to the physically contiguous range
 * starting at @phys_pfn.  Thin wrapper around __domain_mapping() with no
 * scatterlist.
 */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                                     unsigned long phys_pfn, unsigned long nr_pages,
                                     int prot)
{
        return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2376
2377 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2378 {
2379         unsigned long flags;
2380         struct context_entry *context;
2381         u16 did_old;
2382
2383         if (!iommu)
2384                 return;
2385
2386         spin_lock_irqsave(&iommu->lock, flags);
2387         context = iommu_context_addr(iommu, bus, devfn, 0);
2388         if (!context) {
2389                 spin_unlock_irqrestore(&iommu->lock, flags);
2390                 return;
2391         }
2392         did_old = context_domain_id(context);
2393         context_clear_entry(context);
2394         __iommu_flush_cache(iommu, context, sizeof(*context));
2395         spin_unlock_irqrestore(&iommu->lock, flags);
2396         iommu->flush.flush_context(iommu,
2397                                    did_old,
2398                                    (((u16)bus) << 8) | devfn,
2399                                    DMA_CCMD_MASK_NOBIT,
2400                                    DMA_CCMD_DEVICE_INVL);
2401         iommu->flush.flush_iotlb(iommu,
2402                                  did_old,
2403                                  0,
2404                                  0,
2405                                  DMA_TLB_DSI_FLUSH);
2406 }
2407
2408 static inline void unlink_domain_info(struct device_domain_info *info)
2409 {
2410         assert_spin_locked(&device_domain_lock);
2411         list_del(&info->link);
2412         list_del(&info->global);
2413         if (info->dev)
2414                 info->dev->archdata.iommu = NULL;
2415 }
2416
2417 static void domain_remove_dev_info(struct dmar_domain *domain)
2418 {
2419         struct device_domain_info *info, *tmp;
2420         unsigned long flags;
2421
2422         spin_lock_irqsave(&device_domain_lock, flags);
2423         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2424                 __dmar_remove_one_dev_info(info);
2425         spin_unlock_irqrestore(&device_domain_lock, flags);
2426 }
2427
2428 /*
2429  * find_domain
2430  * Note: we use struct device->archdata.iommu stores the info
2431  */
2432 static struct dmar_domain *find_domain(struct device *dev)
2433 {
2434         struct device_domain_info *info;
2435
2436         /* No lock here, assumes no domain exit in normal case */
2437         info = dev->archdata.iommu;
2438         if (likely(info))
2439                 return info->domain;
2440         return NULL;
2441 }
2442
2443 static inline struct device_domain_info *
2444 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2445 {
2446         struct device_domain_info *info;
2447
2448         list_for_each_entry(info, &device_domain_list, global)
2449                 if (info->iommu->segment == segment && info->bus == bus &&
2450                     info->devfn == devfn)
2451                         return info;
2452
2453         return NULL;
2454 }
2455
/*
 * Record that the device at (@bus, @devfn) behind @iommu belongs to
 * @domain and, when @dev is non-NULL, set up its context mapping.
 *
 * Returns:
 *  - @domain when the device was newly attached to it;
 *  - a different, already existing domain when @dev (or another entry
 *    with the same segment/bus/devfn) is already attached somewhere; in
 *    that case @domain is left untouched and the caller must free it;
 *  - NULL on allocation failure, when no domain id is available on
 *    @iommu, or when the context mapping fails.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
                                                    int bus, int devfn,
                                                    struct device *dev,
                                                    struct dmar_domain *domain)
{
        struct dmar_domain *found = NULL;
        struct device_domain_info *info;
        unsigned long flags;
        int ret;

        info = alloc_devinfo_mem();
        if (!info)
                return NULL;

        info->bus = bus;
        info->devfn = devfn;
        info->ats_supported = info->pasid_supported = info->pri_supported = 0;
        info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
        info->ats_qdep = 0;
        info->dev = dev;
        info->domain = domain;
        info->iommu = iommu;

        /* probe the optional PCI features (ATS, PASID, PRI) */
        if (dev && dev_is_pci(dev)) {
                struct pci_dev *pdev = to_pci_dev(info->dev);

                /* ATS needs IOMMU support, the device capability and an ATSR unit */
                if (ecap_dev_iotlb_support(iommu->ecap) &&
                    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
                    dmar_find_matched_atsr_unit(pdev))
                        info->ats_supported = 1;

                if (ecs_enabled(iommu)) {
                        if (pasid_enabled(iommu)) {
                                int features = pci_pasid_features(pdev);
                                /* bit 0 marks "supported"; the rest are the feature bits */
                                if (features >= 0)
                                        info->pasid_supported = features | 1;
                        }

                        /* PRI is only considered when ATS is supported */
                        if (info->ats_supported && ecap_prs(iommu->ecap) &&
                            pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
                                info->pri_supported = 1;
                }
        }

        spin_lock_irqsave(&device_domain_lock, flags);
        /* is the device itself already attached to a domain? */
        if (dev)
                found = find_domain(dev);

        /* otherwise look for an existing entry with the same segment/bus/devfn */
        if (!found) {
                struct device_domain_info *info2;
                info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
                if (info2) {
                        found      = info2->domain;
                        info2->dev = dev;
                }
        }

        if (found) {
                spin_unlock_irqrestore(&device_domain_lock, flags);
                free_devinfo_mem(info);
                /* Caller must free the original domain */
                return found;
        }

        /* iommu->lock nests inside device_domain_lock here */
        spin_lock(&iommu->lock);
        ret = domain_attach_iommu(domain, iommu);
        spin_unlock(&iommu->lock);

        if (ret) {
                spin_unlock_irqrestore(&device_domain_lock, flags);
                free_devinfo_mem(info);
                return NULL;
        }

        /* publish the new entry on the domain's list and the global list */
        list_add(&info->link, &domain->devices);
        list_add(&info->global, &device_domain_list);
        if (dev)
                dev->archdata.iommu = info;
        spin_unlock_irqrestore(&device_domain_lock, flags);

        /* context mapping is done with device_domain_lock released */
        if (dev && domain_context_mapping(domain, dev)) {
                pr_err("Domain context map for %s failed\n", dev_name(dev));
                dmar_remove_one_dev_info(domain, dev);
                return NULL;
        }

        return domain;
}
2544
2545 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2546 {
2547         *(u16 *)opaque = alias;
2548         return 0;
2549 }
2550
2551 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2552 {
2553         struct device_domain_info *info = NULL;
2554         struct dmar_domain *domain = NULL;
2555         struct intel_iommu *iommu;
2556         u16 req_id, dma_alias;
2557         unsigned long flags;
2558         u8 bus, devfn;
2559
2560         iommu = device_to_iommu(dev, &bus, &devfn);
2561         if (!iommu)
2562                 return NULL;
2563
2564         req_id = ((u16)bus << 8) | devfn;
2565
2566         if (dev_is_pci(dev)) {
2567                 struct pci_dev *pdev = to_pci_dev(dev);
2568
2569                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2570
2571                 spin_lock_irqsave(&device_domain_lock, flags);
2572                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2573                                                       PCI_BUS_NUM(dma_alias),
2574                                                       dma_alias & 0xff);
2575                 if (info) {
2576                         iommu = info->iommu;
2577                         domain = info->domain;
2578                 }
2579                 spin_unlock_irqrestore(&device_domain_lock, flags);
2580
2581                 /* DMA alias already has a domain, use it */
2582                 if (info)
2583                         goto out;
2584         }
2585
2586         /* Allocate and initialize new domain for the device */
2587         domain = alloc_domain(0);
2588         if (!domain)
2589                 return NULL;
2590         if (domain_init(domain, iommu, gaw)) {
2591                 domain_exit(domain);
2592                 return NULL;
2593         }
2594
2595 out:
2596
2597         return domain;
2598 }
2599
2600 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2601                                               struct dmar_domain *domain)
2602 {
2603         struct intel_iommu *iommu;
2604         struct dmar_domain *tmp;
2605         u16 req_id, dma_alias;
2606         u8 bus, devfn;
2607
2608         iommu = device_to_iommu(dev, &bus, &devfn);
2609         if (!iommu)
2610                 return NULL;
2611
2612         req_id = ((u16)bus << 8) | devfn;
2613
2614         if (dev_is_pci(dev)) {
2615                 struct pci_dev *pdev = to_pci_dev(dev);
2616
2617                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2618
2619                 /* register PCI DMA alias device */
2620                 if (req_id != dma_alias) {
2621                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2622                                         dma_alias & 0xff, NULL, domain);
2623
2624                         if (!tmp || tmp != domain)
2625                                 return tmp;
2626                 }
2627         }
2628
2629         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2630         if (!tmp || tmp != domain)
2631                 return tmp;
2632
2633         return domain;
2634 }
2635
2636 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2637 {
2638         struct dmar_domain *domain, *tmp;
2639
2640         domain = find_domain(dev);
2641         if (domain)
2642                 goto out;
2643
2644         domain = find_or_alloc_domain(dev, gaw);
2645         if (!domain)
2646                 goto out;
2647
2648         tmp = set_domain_for_dev(dev, domain);
2649         if (!tmp || domain != tmp) {
2650                 domain_exit(domain);
2651                 domain = tmp;
2652         }
2653
2654 out:
2655
2656         return domain;
2657 }
2658
2659 static int iommu_domain_identity_map(struct dmar_domain *domain,
2660                                      unsigned long long start,
2661                                      unsigned long long end)
2662 {
2663         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2664         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2665
2666         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2667                           dma_to_mm_pfn(last_vpfn))) {
2668                 pr_err("Reserving iova failed\n");
2669                 return -ENOMEM;
2670         }
2671
2672         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2673         /*
2674          * RMRR range might have overlap with physical memory range,
2675          * clear it first
2676          */
2677         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2678
2679         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2680                                   last_vpfn - first_vpfn + 1,
2681                                   DMA_PTE_READ|DMA_PTE_WRITE);
2682 }
2683
2684 static int domain_prepare_identity_map(struct device *dev,
2685                                        struct dmar_domain *domain,
2686                                        unsigned long long start,
2687                                        unsigned long long end)
2688 {
2689         /* For _hardware_ passthrough, don't bother. But for software
2690            passthrough, we do it anyway -- it may indicate a memory
2691            range which is reserved in E820, so which didn't get set
2692            up to start with in si_domain */
2693         if (domain == si_domain && hw_pass_through) {
2694                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2695                         dev_name(dev), start, end);
2696                 return 0;
2697         }
2698
2699         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2700                 dev_name(dev), start, end);
2701
2702         if (end < start) {
2703                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2704                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2705                         dmi_get_system_info(DMI_BIOS_VENDOR),
2706                         dmi_get_system_info(DMI_BIOS_VERSION),
2707                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2708                 return -EIO;
2709         }
2710
2711         if (end >> agaw_to_width(domain->agaw)) {
2712                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2713                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2714                      agaw_to_width(domain->agaw),
2715                      dmi_get_system_info(DMI_BIOS_VENDOR),
2716                      dmi_get_system_info(DMI_BIOS_VERSION),
2717                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2718                 return -EIO;
2719         }
2720
2721         return iommu_domain_identity_map(domain, start, end);
2722 }
2723
2724 static int iommu_prepare_identity_map(struct device *dev,
2725                                       unsigned long long start,
2726                                       unsigned long long end)
2727 {
2728         struct dmar_domain *domain;
2729         int ret;
2730
2731         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2732         if (!domain)
2733                 return -ENOMEM;
2734
2735         ret = domain_prepare_identity_map(dev, domain, start, end);
2736         if (ret)
2737                 domain_exit(domain);
2738
2739         return ret;
2740 }
2741
2742 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2743                                          struct device *dev)
2744 {
2745         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2746                 return 0;
2747         return iommu_prepare_identity_map(dev, rmrr->base_address,
2748                                           rmrr->end_address);
2749 }
2750
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Floppy workaround: identity-map the first 16MiB for the first ISA bridge
 * found, if any.  A failure is only logged.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *isa_bridge;

	isa_bridge = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!isa_bridge)
		return;

	pr_info("Prepare 0-16MiB unity mapping for LPC\n");

	if (iommu_prepare_identity_map(&isa_bridge->dev, 0,
				       16*1024*1024 - 1))
		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");

	/* Drop the reference taken by pci_get_class(). */
	pci_dev_put(isa_bridge);
}
#else
/* Workaround not configured: nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2775
2776 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2777
2778 static int __init si_domain_init(int hw)
2779 {
2780         int nid, ret = 0;
2781
2782         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2783         if (!si_domain)
2784                 return -EFAULT;
2785
2786         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2787                 domain_exit(si_domain);
2788                 si_domain = NULL;
2789                 return -EFAULT;
2790         }
2791
2792         pr_debug("Identity mapping domain allocated\n");
2793
2794         if (hw)
2795                 return 0;
2796
2797         for_each_online_node(nid) {
2798                 unsigned long start_pfn, end_pfn;
2799                 int i;
2800
2801                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2802                         ret = iommu_domain_identity_map(si_domain,
2803                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2804                         if (ret)
2805                                 return ret;
2806                 }
2807         }
2808
2809         return 0;
2810 }
2811
2812 static int identity_mapping(struct device *dev)
2813 {
2814         struct device_domain_info *info;
2815
2816         if (likely(!iommu_identity_mapping))
2817                 return 0;
2818
2819         info = dev->archdata.iommu;
2820         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2821                 return (info->domain == si_domain);
2822
2823         return 0;
2824 }
2825
2826 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2827 {
2828         struct dmar_domain *ndomain;
2829         struct intel_iommu *iommu;
2830         u8 bus, devfn;
2831
2832         iommu = device_to_iommu(dev, &bus, &devfn);
2833         if (!iommu)
2834                 return -ENODEV;
2835
2836         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2837         if (ndomain != domain)
2838                 return -EBUSY;
2839
2840         return 0;
2841 }
2842
2843 static bool device_has_rmrr(struct device *dev)
2844 {
2845         struct dmar_rmrr_unit *rmrr;
2846         struct device *tmp;
2847         int i;
2848
2849         rcu_read_lock();
2850         for_each_rmrr_units(rmrr) {
2851                 /*
2852                  * Return TRUE if this RMRR contains the device that
2853                  * is passed in.
2854                  */
2855                 for_each_active_dev_scope(rmrr->devices,
2856                                           rmrr->devices_cnt, i, tmp)
2857                         if (tmp == dev) {
2858                                 rcu_read_unlock();
2859                                 return true;
2860                         }
2861         }
2862         rcu_read_unlock();
2863         return false;
2864 }
2865
2866 /*
2867  * There are a couple cases where we need to restrict the functionality of
2868  * devices associated with RMRRs.  The first is when evaluating a device for
2869  * identity mapping because problems exist when devices are moved in and out
2870  * of domains and their respective RMRR information is lost.  This means that
2871  * a device with associated RMRRs will never be in a "passthrough" domain.
2872  * The second is use of the device through the IOMMU API.  This interface
2873  * expects to have full control of the IOVA space for the device.  We cannot
2874  * satisfy both the requirement that RMRR access is maintained and have an
2875  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2876  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2877  * We therefore prevent devices associated with an RMRR from participating in
2878  * the IOMMU API, which eliminates them from device assignment.
2879  *
2880  * In both cases we assume that PCI USB devices with RMRRs have them largely
2881  * for historical reasons and that the RMRR space is not actively used post
2882  * boot.  This exclusion may change if vendors begin to abuse it.
2883  *
2884  * The same exception is made for graphics devices, with the requirement that
2885  * any use of the RMRR regions will be torn down before assigning the device
2886  * to a guest.
2887  */
2888 static bool device_is_rmrr_locked(struct device *dev)
2889 {
2890         if (!device_has_rmrr(dev))
2891                 return false;
2892
2893         if (dev_is_pci(dev)) {
2894                 struct pci_dev *pdev = to_pci_dev(dev);
2895
2896                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2897                         return false;
2898         }
2899
2900         return true;
2901 }
2902
2903 static int iommu_should_identity_map(struct device *dev, int startup)
2904 {
2905
2906         if (dev_is_pci(dev)) {
2907                 struct pci_dev *pdev = to_pci_dev(dev);
2908
2909                 if (device_is_rmrr_locked(dev))
2910                         return 0;
2911
2912                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2913                         return 1;
2914
2915                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2916                         return 1;
2917
2918                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2919                         return 0;
2920
2921                 /*
2922                  * We want to start off with all devices in the 1:1 domain, and
2923                  * take them out later if we find they can't access all of memory.
2924                  *
2925                  * However, we can't do this for PCI devices behind bridges,
2926                  * because all PCI devices behind the same bridge will end up
2927                  * with the same source-id on their transactions.
2928                  *
2929                  * Practically speaking, we can't change things around for these
2930                  * devices at run-time, because we can't be sure there'll be no
2931                  * DMA transactions in flight for any of their siblings.
2932                  *
2933                  * So PCI devices (unless they're on the root bus) as well as
2934                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2935                  * the 1:1 domain, just in _case_ one of their siblings turns out
2936                  * not to be able to map all of memory.
2937                  */
2938                 if (!pci_is_pcie(pdev)) {
2939                         if (!pci_is_root_bus(pdev->bus))
2940                                 return 0;
2941                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2942                                 return 0;
2943                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2944                         return 0;
2945         } else {
2946                 if (device_has_rmrr(dev))
2947                         return 0;
2948         }
2949
2950         /*
2951          * At boot time, we don't yet know if devices will be 64-bit capable.
2952          * Assume that they will — if they turn out not to be, then we can
2953          * take them out of the 1:1 domain later.
2954          */
2955         if (!startup) {
2956                 /*
2957                  * If the device's dma_mask is less than the system's memory
2958                  * size then this is not a candidate for identity mapping.
2959                  */
2960                 u64 dma_mask = *dev->dma_mask;
2961
2962                 if (dev->coherent_dma_mask &&
2963                     dev->coherent_dma_mask < dma_mask)
2964                         dma_mask = dev->coherent_dma_mask;
2965
2966                 return dma_mask >= dma_get_required_mask(dev);
2967         }
2968
2969         return 1;
2970 }
2971
2972 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2973 {
2974         int ret;
2975
2976         if (!iommu_should_identity_map(dev, 1))
2977                 return 0;
2978
2979         ret = domain_add_dev_info(si_domain, dev);
2980         if (!ret)
2981                 pr_info("%s identity mapping for device %s\n",
2982                         hw ? "Hardware" : "Software", dev_name(dev));
2983         else if (ret == -ENODEV)
2984                 /* device not associated with an iommu */
2985                 ret = 0;
2986
2987         return ret;
2988 }
2989
2990
2991 static int __init iommu_prepare_static_identity_mapping(int hw)
2992 {
2993         struct pci_dev *pdev = NULL;
2994         struct dmar_drhd_unit *drhd;
2995         struct intel_iommu *iommu;
2996         struct device *dev;
2997         int i;
2998         int ret = 0;
2999
3000         for_each_pci_dev(pdev) {
3001                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3002                 if (ret)
3003                         return ret;
3004         }
3005
3006         for_each_active_iommu(iommu, drhd)
3007                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3008                         struct acpi_device_physical_node *pn;
3009                         struct acpi_device *adev;
3010
3011                         if (dev->bus != &acpi_bus_type)
3012                                 continue;
3013
3014                         adev= to_acpi_device(dev);
3015                         mutex_lock(&adev->physical_node_lock);
3016                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3017                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3018                                 if (ret)
3019                                         break;
3020                         }
3021                         mutex_unlock(&adev->physical_node_lock);
3022                         if (ret)
3023                                 return ret;
3024                 }
3025
3026         return 0;
3027 }
3028
3029 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3030 {
3031         /*
3032          * Start from the sane iommu hardware state.
3033          * If the queued invalidation is already initialized by us
3034          * (for example, while enabling interrupt-remapping) then
3035          * we got the things already rolling from a sane state.
3036          */
3037         if (!iommu->qi) {
3038                 /*
3039                  * Clear any previous faults.
3040                  */
3041                 dmar_fault(-1, iommu);
3042                 /*
3043                  * Disable queued invalidation if supported and already enabled
3044                  * before OS handover.
3045                  */
3046                 dmar_disable_qi(iommu);
3047         }
3048
3049         if (dmar_enable_qi(iommu)) {
3050                 /*
3051                  * Queued Invalidate not enabled, use Register Based Invalidate
3052                  */
3053                 iommu->flush.flush_context = __iommu_flush_context;
3054                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3055                 pr_info("%s: Using Register based invalidation\n",
3056                         iommu->name);
3057         } else {
3058                 iommu->flush.flush_context = qi_flush_context;
3059                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3060                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3061         }
3062 }
3063
3064 static int copy_context_table(struct intel_iommu *iommu,
3065                               struct root_entry *old_re,
3066                               struct context_entry **tbl,
3067                               int bus, bool ext)
3068 {
3069         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3070         struct context_entry *new_ce = NULL, ce;
3071         struct context_entry *old_ce = NULL;
3072         struct root_entry re;
3073         phys_addr_t old_ce_phys;
3074
3075         tbl_idx = ext ? bus * 2 : bus;
3076         memcpy(&re, old_re, sizeof(re));
3077
3078         for (devfn = 0; devfn < 256; devfn++) {
3079                 /* First calculate the correct index */
3080                 idx = (ext ? devfn * 2 : devfn) % 256;
3081
3082                 if (idx == 0) {
3083                         /* First save what we may have and clean up */
3084                         if (new_ce) {
3085                                 tbl[tbl_idx] = new_ce;
3086                                 __iommu_flush_cache(iommu, new_ce,
3087                                                     VTD_PAGE_SIZE);
3088                                 pos = 1;
3089                         }
3090
3091                         if (old_ce)
3092                                 memunmap(old_ce);
3093
3094                         ret = 0;
3095                         if (devfn < 0x80)
3096                                 old_ce_phys = root_entry_lctp(&re);
3097                         else
3098                                 old_ce_phys = root_entry_uctp(&re);
3099
3100                         if (!old_ce_phys) {
3101                                 if (ext && devfn == 0) {
3102                                         /* No LCTP, try UCTP */
3103                                         devfn = 0x7f;
3104                                         continue;
3105                                 } else {
3106                                         goto out;
3107                                 }
3108                         }
3109
3110                         ret = -ENOMEM;
3111                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3112                                         MEMREMAP_WB);
3113                         if (!old_ce)
3114                                 goto out;
3115
3116                         new_ce = alloc_pgtable_page(iommu->node);
3117                         if (!new_ce)
3118                                 goto out_unmap;
3119
3120                         ret = 0;
3121                 }
3122
3123                 /* Now copy the context entry */
3124                 memcpy(&ce, old_ce + idx, sizeof(ce));
3125
3126                 if (!__context_present(&ce))
3127                         continue;
3128
3129                 did = context_domain_id(&ce);
3130                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3131                         set_bit(did, iommu->domain_ids);
3132
3133                 /*
3134                  * We need a marker for copied context entries. This
3135                  * marker needs to work for the old format as well as
3136                  * for extended context entries.
3137                  *
3138                  * Bit 67 of the context entry is used. In the old
3139                  * format this bit is available to software, in the
3140                  * extended format it is the PGE bit, but PGE is ignored
3141                  * by HW if PASIDs are disabled (and thus still
3142                  * available).
3143                  *
3144                  * So disable PASIDs first and then mark the entry
3145                  * copied. This means that we don't copy PASID
3146                  * translations from the old kernel, but this is fine as
3147                  * faults there are not fatal.
3148                  */
3149                 context_clear_pasid_enable(&ce);
3150                 context_set_copied(&ce);
3151
3152                 new_ce[idx] = ce;
3153         }
3154
3155         tbl[tbl_idx + pos] = new_ce;
3156
3157         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3158
3159 out_unmap:
3160         memunmap(old_ce);
3161
3162 out:
3163         return ret;
3164 }
3165
3166 static int copy_translation_tables(struct intel_iommu *iommu)
3167 {
3168         struct context_entry **ctxt_tbls;
3169         struct root_entry *old_rt;
3170         phys_addr_t old_rt_phys;
3171         int ctxt_table_entries;
3172         unsigned long flags;
3173         u64 rtaddr_reg;
3174         int bus, ret;
3175         bool new_ext, ext;
3176
3177         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3178         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3179         new_ext    = !!ecap_ecs(iommu->ecap);
3180
3181         /*
3182          * The RTT bit can only be changed when translation is disabled,
3183          * but disabling translation means to open a window for data
3184          * corruption. So bail out and don't copy anything if we would
3185          * have to change the bit.
3186          */
3187         if (new_ext != ext)
3188                 return -EINVAL;
3189
3190         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3191         if (!old_rt_phys)
3192                 return -EINVAL;
3193
3194         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3195         if (!old_rt)
3196                 return -ENOMEM;
3197
3198         /* This is too big for the stack - allocate it from slab */
3199         ctxt_table_entries = ext ? 512 : 256;
3200         ret = -ENOMEM;
3201         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3202         if (!ctxt_tbls)
3203                 goto out_unmap;
3204
3205         for (bus = 0; bus < 256; bus++) {
3206                 ret = copy_context_table(iommu, &old_rt[bus],
3207                                          ctxt_tbls, bus, ext);
3208                 if (ret) {
3209                         pr_err("%s: Failed to copy context table for bus %d\n",
3210                                 iommu->name, bus);
3211                         continue;
3212                 }
3213         }
3214
3215         spin_lock_irqsave(&iommu->lock, flags);
3216
3217         /* Context tables are copied, now write them to the root_entry table */
3218         for (bus = 0; bus < 256; bus++) {
3219                 int idx = ext ? bus * 2 : bus;
3220                 u64 val;
3221
3222                 if (ctxt_tbls[idx]) {
3223                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3224                         iommu->root_entry[bus].lo = val;
3225                 }
3226
3227                 if (!ext || !ctxt_tbls[idx + 1])
3228                         continue;
3229
3230                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3231                 iommu->root_entry[bus].hi = val;
3232         }
3233
3234         spin_unlock_irqrestore(&iommu->lock, flags);
3235
3236         kfree(ctxt_tbls);
3237
3238         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3239
3240         ret = 0;
3241
3242 out_unmap:
3243         memunmap(old_rt);
3244
3245         return ret;
3246 }
3247
3248 static int __init init_dmars(void)
3249 {
3250         struct dmar_drhd_unit *drhd;
3251         struct dmar_rmrr_unit *rmrr;
3252         bool copied_tables = false;
3253         struct device *dev;
3254         struct intel_iommu *iommu;
3255         int i, ret;
3256
3257         /*
3258          * for each drhd
3259          *    allocate root
3260          *    initialize and program root entry to not present
3261          * endfor
3262          */
3263         for_each_drhd_unit(drhd) {
3264                 /*
3265                  * lock not needed as this is only incremented in the single
3266                  * threaded kernel __init code path all other access are read
3267                  * only
3268                  */
3269                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3270                         g_num_of_iommus++;
3271                         continue;
3272                 }
3273                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3274         }
3275
3276         /* Preallocate enough resources for IOMMU hot-addition */
3277         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3278                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3279
3280         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3281                         GFP_KERNEL);
3282         if (!g_iommus) {
3283                 pr_err("Allocating global iommu array failed\n");
3284                 ret = -ENOMEM;
3285                 goto error;
3286         }
3287
3288         for_each_active_iommu(iommu, drhd) {
3289                 g_iommus[iommu->seq_id] = iommu;
3290
3291                 intel_iommu_init_qi(iommu);
3292
3293                 ret = iommu_init_domains(iommu);
3294                 if (ret)
3295                         goto free_iommu;
3296
3297                 init_translation_status(iommu);
3298
3299                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3300                         iommu_disable_translation(iommu);
3301                         clear_translation_pre_enabled(iommu);
3302                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3303                                 iommu->name);
3304                 }
3305
3306                 /*
3307                  * TBD:
3308                  * we could share the same root & context tables
3309                  * among all IOMMU's. Need to Split it later.
3310                  */
3311                 ret = iommu_alloc_root_entry(iommu);
3312                 if (ret)
3313                         goto free_iommu;
3314
3315                 if (translation_pre_enabled(iommu)) {
3316                         pr_info("Translation already enabled - trying to copy translation structures\n");
3317
3318                         ret = copy_translation_tables(iommu);
3319                         if (ret) {
3320                                 /*
3321                                  * We found the IOMMU with translation
3322                                  * enabled - but failed to copy over the
3323                                  * old root-entry table. Try to proceed
3324                                  * by disabling translation now and
3325                                  * allocating a clean root-entry table.
3326                                  * This might cause DMAR faults, but
3327                                  * probably the dump will still succeed.
3328                                  */
3329                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3330                                        iommu->name);
3331                                 iommu_disable_translation(iommu);
3332                                 clear_translation_pre_enabled(iommu);
3333                         } else {
3334                                 pr_info("Copied translation tables from previous kernel for %s\n",
3335                                         iommu->name);
3336                                 copied_tables = true;
3337                         }
3338                 }
3339
3340                 if (!ecap_pass_through(iommu->ecap))
3341                         hw_pass_through = 0;
3342
3343                 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3344                         pr_info("Disable batched IOTLB flush due to virtualization");
3345                         intel_iommu_strict = 1;
3346                 }
3347
3348 #ifdef CONFIG_INTEL_IOMMU_SVM
3349                 if (pasid_enabled(iommu))
3350                         intel_svm_alloc_pasid_tables(iommu);
3351 #endif
3352         }
3353
3354         /*
3355          * Now that qi is enabled on all iommus, set the root entry and flush
3356          * caches. This is required on some Intel X58 chipsets, otherwise the
3357          * flush_context function will loop forever and the boot hangs.
3358          */
3359         for_each_active_iommu(iommu, drhd) {
3360                 iommu_flush_write_buffer(iommu);
3361                 iommu_set_root_entry(iommu);
3362                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3363                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3364         }
3365
3366         if (iommu_pass_through)
3367                 iommu_identity_mapping |= IDENTMAP_ALL;
3368
3369 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3370         dmar_map_gfx = 0;
3371 #endif
3372
3373         if (!dmar_map_gfx)
3374                 iommu_identity_mapping |= IDENTMAP_GFX;
3375
3376         check_tylersburg_isoch();
3377
3378         if (iommu_identity_mapping) {
3379                 ret = si_domain_init(hw_pass_through);
3380                 if (ret)
3381                         goto free_iommu;
3382         }
3383
3384
3385         /*
3386          * If we copied translations from a previous kernel in the kdump
3387          * case, we can not assign the devices to domains now, as that
3388          * would eliminate the old mappings. So skip this part and defer
3389          * the assignment to device driver initialization time.
3390          */
3391         if (copied_tables)
3392                 goto domains_done;
3393
3394         /*
3395          * If pass through is not set or not enabled, setup context entries for
3396          * identity mappings for rmrr, gfx, and isa and may fall back to static
3397          * identity mapping if iommu_identity_mapping is set.
3398          */
3399         if (iommu_identity_mapping) {
3400                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3401                 if (ret) {
3402                         pr_crit("Failed to setup IOMMU pass-through\n");
3403                         goto free_iommu;
3404                 }
3405         }
3406         /*
3407          * For each rmrr
3408          *   for each dev attached to rmrr
3409          *   do
3410          *     locate drhd for dev, alloc domain for dev
3411          *     allocate free domain
3412          *     allocate page table entries for rmrr
3413          *     if context not allocated for bus
3414          *           allocate and init context
3415          *           set present in root table for this bus
3416          *     init context with domain, translation etc
3417          *    endfor
3418          * endfor
3419          */
3420         pr_info("Setting RMRR:\n");
3421         for_each_rmrr_units(rmrr) {
3422                 /* some BIOS lists non-exist devices in DMAR table. */
3423                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3424                                           i, dev) {
3425                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3426                         if (ret)
3427                                 pr_err("Mapping reserved region failed\n");
3428                 }
3429         }
3430
3431         iommu_prepare_isa();
3432
3433 domains_done:
3434
3435         /*
3436          * for each drhd
3437          *   enable fault log
3438          *   global invalidate context cache
3439          *   global invalidate iotlb
3440          *   enable translation
3441          */
3442         for_each_iommu(iommu, drhd) {
3443                 if (drhd->ignored) {
3444                         /*
3445                          * we always have to disable PMRs or DMA may fail on
3446                          * this device
3447                          */
3448                         if (force_on)
3449                                 iommu_disable_protect_mem_regions(iommu);
3450                         continue;
3451                 }
3452
3453                 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3457                         ret = intel_svm_enable_prq(iommu);
3458                         if (ret)
3459                                 goto free_iommu;
3460                 }
3461 #endif
3462                 ret = dmar_set_interrupt(iommu);
3463                 if (ret)
3464                         goto free_iommu;
3465
3466                 if (!translation_pre_enabled(iommu))
3467                         iommu_enable_translation(iommu);
3468
3469                 iommu_disable_protect_mem_regions(iommu);
3470         }
3471
3472         return 0;
3473
3474 free_iommu:
3475         for_each_active_iommu(iommu, drhd) {
3476                 disable_dmar_iommu(iommu);
3477                 free_dmar_iommu(iommu);
3478         }
3479         if (si_domain) {
3480                 domain_exit(si_domain);
3481                 si_domain = NULL;
3482         }
3483
3484         kfree(g_iommus);
3485
3486 error:
3487         return ret;
3488 }
3489
3490 /* This takes a number of _MM_ pages, not VTD pages */
3491 static unsigned long intel_alloc_iova(struct device *dev,
3492                                      struct dmar_domain *domain,
3493                                      unsigned long nrpages, uint64_t dma_mask)
3494 {
3495         unsigned long iova_pfn = 0;
3496
3497         /* Restrict dma_mask to the width that the iommu can handle */
3498         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3499         /* Ensure we reserve the whole size-aligned region */
3500         nrpages = __roundup_pow_of_two(nrpages);
3501
3502         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3503                 /*
3504                  * First try to allocate an io virtual address in
3505                  * DMA_BIT_MASK(32) and if that fails then try allocating
3506                  * from higher range
3507                  */
3508                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3509                                            IOVA_PFN(DMA_BIT_MASK(32)));
3510                 if (iova_pfn)
3511                         return iova_pfn;
3512         }
3513         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages, IOVA_PFN(dma_mask));
3514         if (unlikely(!iova_pfn)) {
3515                 pr_err("Allocating %ld-page iova for %s failed",
3516                        nrpages, dev_name(dev));
3517                 return 0;
3518         }
3519
3520         return iova_pfn;
3521 }
3522
3523 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3524 {
3525         struct dmar_domain *domain, *tmp;
3526         struct dmar_rmrr_unit *rmrr;
3527         struct device *i_dev;
3528         int i, ret;
3529
3530         domain = find_domain(dev);
3531         if (domain)
3532                 goto out;
3533
3534         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3535         if (!domain)
3536                 goto out;
3537
3538         /* We have a new domain - setup possible RMRRs for the device */
3539         rcu_read_lock();
3540         for_each_rmrr_units(rmrr) {
3541                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3542                                           i, i_dev) {
3543                         if (i_dev != dev)
3544                                 continue;
3545
3546                         ret = domain_prepare_identity_map(dev, domain,
3547                                                           rmrr->base_address,
3548                                                           rmrr->end_address);
3549                         if (ret)
3550                                 dev_err(dev, "Mapping reserved region failed\n");
3551                 }
3552         }
3553         rcu_read_unlock();
3554
3555         tmp = set_domain_for_dev(dev, domain);
3556         if (!tmp || domain != tmp) {
3557                 domain_exit(domain);
3558                 domain = tmp;
3559         }
3560
3561 out:
3562
3563         if (!domain)
3564                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3565
3566
3567         return domain;
3568 }
3569
3570 /* Check if the dev needs to go through non-identity map and unmap process.*/
3571 static int iommu_no_mapping(struct device *dev)
3572 {
3573         int found;
3574
3575         if (iommu_dummy(dev))
3576                 return 1;
3577
3578         if (!iommu_identity_mapping)
3579                 return 0;
3580
3581         found = identity_mapping(dev);
3582         if (found) {
3583                 if (iommu_should_identity_map(dev, 0))
3584                         return 1;
3585                 else {
3586                         /*
3587                          * 32 bit DMA is removed from si_domain and fall back
3588                          * to non-identity mapping.
3589                          */
3590                         dmar_remove_one_dev_info(si_domain, dev);
3591                         pr_info("32bit %s uses non-identity mapping\n",
3592                                 dev_name(dev));
3593                         return 0;
3594                 }
3595         } else {
3596                 /*
3597                  * In case of a detached 64 bit DMA device from vm, the device
3598                  * is put into si_domain for identity mapping.
3599                  */
3600                 if (iommu_should_identity_map(dev, 0)) {
3601                         int ret;
3602                         ret = domain_add_dev_info(si_domain, dev);
3603                         if (!ret) {
3604                                 pr_info("64bit %s uses identity mapping\n",
3605                                         dev_name(dev));
3606                                 return 1;
3607                         }
3608                 }
3609         }
3610
3611         return 0;
3612 }
3613
3614 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3615                                      size_t size, int dir, u64 dma_mask)
3616 {
3617         struct dmar_domain *domain;
3618         phys_addr_t start_paddr;
3619         unsigned long iova_pfn;
3620         int prot = 0;
3621         int ret;
3622         struct intel_iommu *iommu;
3623         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3624
3625         BUG_ON(dir == DMA_NONE);
3626
3627         if (iommu_no_mapping(dev))
3628                 return paddr;
3629
3630         domain = get_valid_domain_for_dev(dev);
3631         if (!domain)
3632                 return 0;
3633
3634         iommu = domain_get_iommu(domain);
3635         size = aligned_nrpages(paddr, size);
3636
3637         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3638         if (!iova_pfn)
3639                 goto error;
3640
3641         /*
3642          * Check if DMAR supports zero-length reads on write only
3643          * mappings..
3644          */
3645         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3646                         !cap_zlr(iommu->cap))
3647                 prot |= DMA_PTE_READ;
3648         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3649                 prot |= DMA_PTE_WRITE;
3650         /*
3651          * paddr - (paddr + size) might be partial page, we should map the whole
3652          * page.  Note: if two part of one page are separately mapped, we
3653          * might have two guest_addr mapping to the same host paddr, but this
3654          * is not a big problem
3655          */
3656         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3657                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3658         if (ret)
3659                 goto error;
3660
3661         /* it's a non-present to present mapping. Only flush if caching mode */
3662         if (cap_caching_mode(iommu->cap))
3663                 iommu_flush_iotlb_psi(iommu, domain,
3664                                       mm_to_dma_pfn(iova_pfn),
3665                                       size, 0, 1);
3666         else
3667                 iommu_flush_write_buffer(iommu);
3668
3669         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3670         start_paddr += paddr & ~PAGE_MASK;
3671         return start_paddr;
3672
3673 error:
3674         if (iova_pfn)
3675                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3676         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3677                 dev_name(dev), size, (unsigned long long)paddr, dir);
3678         return 0;
3679 }
3680
3681 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3682                                  unsigned long offset, size_t size,
3683                                  enum dma_data_direction dir,
3684                                  unsigned long attrs)
3685 {
3686         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3687                                   dir, *dev->dma_mask);
3688 }
3689
3690 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3691 {
3692         struct dmar_domain *domain;
3693         unsigned long start_pfn, last_pfn;
3694         unsigned long nrpages;
3695         unsigned long iova_pfn;
3696         struct intel_iommu *iommu;
3697         struct page *freelist;
3698
3699         if (iommu_no_mapping(dev))
3700                 return;
3701
3702         domain = find_domain(dev);
3703         BUG_ON(!domain);
3704
3705         iommu = domain_get_iommu(domain);
3706
3707         iova_pfn = IOVA_PFN(dev_addr);
3708
3709         nrpages = aligned_nrpages(dev_addr, size);
3710         start_pfn = mm_to_dma_pfn(iova_pfn);
3711         last_pfn = start_pfn + nrpages - 1;
3712
3713         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3714                  dev_name(dev), start_pfn, last_pfn);
3715
3716         freelist = domain_unmap(domain, start_pfn, last_pfn);
3717
3718         if (intel_iommu_strict || !has_iova_flush_queue(&domain->iovad)) {
3719                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3720                                       nrpages, !freelist, 0);
3721                 /* free iova */
3722                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3723                 dma_free_pagelist(freelist);
3724         } else {
3725                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3726                            (unsigned long)freelist);
3727                 /*
3728                  * queue up the release of the unmap to save the 1/6th of the
3729                  * cpu used up by the iotlb flush operation...
3730                  */
3731         }
3732 }
3733
3734 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3735                              size_t size, enum dma_data_direction dir,
3736                              unsigned long attrs)
3737 {
3738         intel_unmap(dev, dev_addr, size);
3739 }
3740
3741 static void *intel_alloc_coherent(struct device *dev, size_t size,
3742                                   dma_addr_t *dma_handle, gfp_t flags,
3743                                   unsigned long attrs)
3744 {
3745         struct page *page = NULL;
3746         int order;
3747
3748         size = PAGE_ALIGN(size);
3749         order = get_order(size);
3750
3751         if (!iommu_no_mapping(dev))
3752                 flags &= ~(GFP_DMA | GFP_DMA32);
3753         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3754                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3755                         flags |= GFP_DMA;
3756                 else
3757                         flags |= GFP_DMA32;
3758         }
3759
3760         if (gfpflags_allow_blocking(flags)) {
3761                 unsigned int count = size >> PAGE_SHIFT;
3762
3763                 page = dma_alloc_from_contiguous(dev, count, order, flags);
3764                 if (page && iommu_no_mapping(dev) &&
3765                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3766                         dma_release_from_contiguous(dev, page, count);
3767                         page = NULL;
3768                 }
3769         }
3770
3771         if (!page)
3772                 page = alloc_pages(flags, order);
3773         if (!page)
3774                 return NULL;
3775         memset(page_address(page), 0, size);
3776
3777         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3778                                          DMA_BIDIRECTIONAL,
3779                                          dev->coherent_dma_mask);
3780         if (*dma_handle)
3781                 return page_address(page);
3782         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3783                 __free_pages(page, order);
3784
3785         return NULL;
3786 }
3787
3788 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3789                                 dma_addr_t dma_handle, unsigned long attrs)
3790 {
3791         int order;
3792         struct page *page = virt_to_page(vaddr);
3793
3794         size = PAGE_ALIGN(size);
3795         order = get_order(size);
3796
3797         intel_unmap(dev, dma_handle, size);
3798         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3799                 __free_pages(page, order);
3800 }
3801
3802 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3803                            int nelems, enum dma_data_direction dir,
3804                            unsigned long attrs)
3805 {
3806         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3807         unsigned long nrpages = 0;
3808         struct scatterlist *sg;
3809         int i;
3810
3811         for_each_sg(sglist, sg, nelems, i) {
3812                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3813         }
3814
3815         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3816 }
3817
3818 static int intel_nontranslate_map_sg(struct device *hddev,
3819         struct scatterlist *sglist, int nelems, int dir)
3820 {
3821         int i;
3822         struct scatterlist *sg;
3823
3824         for_each_sg(sglist, sg, nelems, i) {
3825                 BUG_ON(!sg_page(sg));
3826                 sg->dma_address = sg_phys(sg);
3827                 sg->dma_length = sg->length;
3828         }
3829         return nelems;
3830 }
3831
3832 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3833                         enum dma_data_direction dir, unsigned long attrs)
3834 {
3835         int i;
3836         struct dmar_domain *domain;
3837         size_t size = 0;
3838         int prot = 0;
3839         unsigned long iova_pfn;
3840         int ret;
3841         struct scatterlist *sg;
3842         unsigned long start_vpfn;
3843         struct intel_iommu *iommu;
3844
3845         BUG_ON(dir == DMA_NONE);
3846         if (iommu_no_mapping(dev))
3847                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3848
3849         domain = get_valid_domain_for_dev(dev);
3850         if (!domain)
3851                 return 0;
3852
3853         iommu = domain_get_iommu(domain);
3854
3855         for_each_sg(sglist, sg, nelems, i)
3856                 size += aligned_nrpages(sg->offset, sg->length);
3857
3858         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3859                                 *dev->dma_mask);
3860         if (!iova_pfn) {
3861                 sglist->dma_length = 0;
3862                 return 0;
3863         }
3864
3865         /*
3866          * Check if DMAR supports zero-length reads on write only
3867          * mappings..
3868          */
3869         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3870                         !cap_zlr(iommu->cap))
3871                 prot |= DMA_PTE_READ;
3872         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3873                 prot |= DMA_PTE_WRITE;
3874
3875         start_vpfn = mm_to_dma_pfn(iova_pfn);
3876
3877         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3878         if (unlikely(ret)) {
3879                 dma_pte_free_pagetable(domain, start_vpfn,
3880                                        start_vpfn + size - 1,
3881                                        agaw_to_level(domain->agaw) + 1);
3882                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3883                 return 0;
3884         }
3885
3886         /* it's a non-present to present mapping. Only flush if caching mode */
3887         if (cap_caching_mode(iommu->cap))
3888                 iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3889         else
3890                 iommu_flush_write_buffer(iommu);
3891
3892         return nelems;
3893 }
3894
3895 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3896 {
3897         return !dma_addr;
3898 }
3899
3900 const struct dma_map_ops intel_dma_ops = {
3901         .alloc = intel_alloc_coherent,
3902         .free = intel_free_coherent,
3903         .map_sg = intel_map_sg,
3904         .unmap_sg = intel_unmap_sg,
3905         .map_page = intel_map_page,
3906         .unmap_page = intel_unmap_page,
3907         .mapping_error = intel_mapping_error,
3908 #ifdef CONFIG_X86
3909         .dma_supported = x86_dma_supported,
3910 #endif
3911 };
3912
3913 static inline int iommu_domain_cache_init(void)
3914 {
3915         int ret = 0;
3916
3917         iommu_domain_cache = kmem_cache_create("iommu_domain",
3918                                          sizeof(struct dmar_domain),
3919                                          0,
3920                                          SLAB_HWCACHE_ALIGN,
3921
3922                                          NULL);
3923         if (!iommu_domain_cache) {
3924                 pr_err("Couldn't create iommu_domain cache\n");
3925                 ret = -ENOMEM;
3926         }
3927
3928         return ret;
3929 }
3930
3931 static inline int iommu_devinfo_cache_init(void)
3932 {
3933         int ret = 0;
3934
3935         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3936                                          sizeof(struct device_domain_info),
3937                                          0,
3938                                          SLAB_HWCACHE_ALIGN,
3939                                          NULL);
3940         if (!iommu_devinfo_cache) {
3941                 pr_err("Couldn't create devinfo cache\n");
3942                 ret = -ENOMEM;
3943         }
3944
3945         return ret;
3946 }
3947
3948 static int __init iommu_init_mempool(void)
3949 {
3950         int ret;
3951         ret = iova_cache_get();
3952         if (ret)
3953                 return ret;
3954
3955         ret = iommu_domain_cache_init();
3956         if (ret)
3957                 goto domain_error;
3958
3959         ret = iommu_devinfo_cache_init();
3960         if (!ret)
3961                 return ret;
3962
3963         kmem_cache_destroy(iommu_domain_cache);
3964 domain_error:
3965         iova_cache_put();
3966
3967         return -ENOMEM;
3968 }
3969
3970 static void __init iommu_exit_mempool(void)
3971 {
3972         kmem_cache_destroy(iommu_devinfo_cache);
3973         kmem_cache_destroy(iommu_domain_cache);
3974         iova_cache_put();
3975 }
3976
3977 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3978 {
3979         struct dmar_drhd_unit *drhd;
3980         u32 vtbar;
3981         int rc;
3982
3983         /* We know that this device on this chipset has its own IOMMU.
3984          * If we find it under a different IOMMU, then the BIOS is lying
3985          * to us. Hope that the IOMMU for this device is actually
3986          * disabled, and it needs no translation...
3987          */
3988         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3989         if (rc) {
3990                 /* "can't" happen */
3991                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3992                 return;
3993         }
3994         vtbar &= 0xffff0000;
3995
3996         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3997         drhd = dmar_find_matched_drhd_unit(pdev);
3998         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
3999                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4000                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4001                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4002         }
4003 }
4004 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4005
4006 static void __init init_no_remapping_devices(void)
4007 {
4008         struct dmar_drhd_unit *drhd;
4009         struct device *dev;
4010         int i;
4011
4012         for_each_drhd_unit(drhd) {
4013                 if (!drhd->include_all) {
4014                         for_each_active_dev_scope(drhd->devices,
4015                                                   drhd->devices_cnt, i, dev)
4016                                 break;
4017                         /* ignore DMAR unit if no devices exist */
4018                         if (i == drhd->devices_cnt)
4019                                 drhd->ignored = 1;
4020                 }
4021         }
4022
4023         for_each_active_drhd_unit(drhd) {
4024                 if (drhd->include_all)
4025                         continue;
4026
4027                 for_each_active_dev_scope(drhd->devices,
4028                                           drhd->devices_cnt, i, dev)
4029                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4030                                 break;
4031                 if (i < drhd->devices_cnt)
4032                         continue;
4033
4034                 /* This IOMMU has *only* gfx devices. Either bypass it or
4035                    set the gfx_mapped flag, as appropriate */
4036                 if (!dmar_map_gfx) {
4037                         drhd->ignored = 1;
4038                         for_each_active_dev_scope(drhd->devices,
4039                                                   drhd->devices_cnt, i, dev)
4040                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4041                 }
4042         }
4043 }
4044
4045 #ifdef CONFIG_SUSPEND
4046 static int init_iommu_hw(void)
4047 {
4048         struct dmar_drhd_unit *drhd;
4049         struct intel_iommu *iommu = NULL;
4050
4051         for_each_active_iommu(iommu, drhd)
4052                 if (iommu->qi)
4053                         dmar_reenable_qi(iommu);
4054
4055         for_each_iommu(iommu, drhd) {
4056                 if (drhd->ignored) {
4057                         /*
4058                          * we always have to disable PMRs or DMA may fail on
4059                          * this device
4060                          */
4061                         if (force_on)
4062                                 iommu_disable_protect_mem_regions(iommu);
4063                         continue;
4064                 }
4065
4066                 iommu_flush_write_buffer(iommu);
4067
4068                 iommu_set_root_entry(iommu);
4069
4070                 iommu->flush.flush_context(iommu, 0, 0, 0,
4071                                            DMA_CCMD_GLOBAL_INVL);
4072                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4073                 iommu_enable_translation(iommu);
4074                 iommu_disable_protect_mem_regions(iommu);
4075         }
4076
4077         return 0;
4078 }
4079
4080 static void iommu_flush_all(void)
4081 {
4082         struct dmar_drhd_unit *drhd;
4083         struct intel_iommu *iommu;
4084
4085         for_each_active_iommu(iommu, drhd) {
4086                 iommu->flush.flush_context(iommu, 0, 0, 0,
4087                                            DMA_CCMD_GLOBAL_INVL);
4088                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4089                                          DMA_TLB_GLOBAL_FLUSH);
4090         }
4091 }
4092
4093 static int iommu_suspend(void)
4094 {
4095         struct dmar_drhd_unit *drhd;
4096         struct intel_iommu *iommu = NULL;
4097         unsigned long flag;
4098
4099         for_each_active_iommu(iommu, drhd) {
4100                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4101                                                  GFP_ATOMIC);
4102                 if (!iommu->iommu_state)
4103                         goto nomem;
4104         }
4105
4106         iommu_flush_all();
4107
4108         for_each_active_iommu(iommu, drhd) {
4109                 iommu_disable_translation(iommu);
4110
4111                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4112
4113                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4114                         readl(iommu->reg + DMAR_FECTL_REG);
4115                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4116                         readl(iommu->reg + DMAR_FEDATA_REG);
4117                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4118                         readl(iommu->reg + DMAR_FEADDR_REG);
4119                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4120                         readl(iommu->reg + DMAR_FEUADDR_REG);
4121
4122                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4123         }
4124         return 0;
4125
4126 nomem:
4127         for_each_active_iommu(iommu, drhd)
4128                 kfree(iommu->iommu_state);
4129
4130         return -ENOMEM;
4131 }
4132
4133 static void iommu_resume(void)
4134 {
4135         struct dmar_drhd_unit *drhd;
4136         struct intel_iommu *iommu = NULL;
4137         unsigned long flag;
4138
4139         if (init_iommu_hw()) {
4140                 if (force_on)
4141                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4142                 else
4143                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4144                 return;
4145         }
4146
4147         for_each_active_iommu(iommu, drhd) {
4148
4149                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4150
4151                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4152                         iommu->reg + DMAR_FECTL_REG);
4153                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4154                         iommu->reg + DMAR_FEDATA_REG);
4155                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4156                         iommu->reg + DMAR_FEADDR_REG);
4157                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4158                         iommu->reg + DMAR_FEUADDR_REG);
4159
4160                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4161         }
4162
4163         for_each_active_iommu(iommu, drhd)
4164                 kfree(iommu->iommu_state);
4165 }
4166
4167 static struct syscore_ops iommu_syscore_ops = {
4168         .resume         = iommu_resume,
4169         .suspend        = iommu_suspend,
4170 };
4171
/* Register the IOMMU suspend/resume callbacks with the syscore framework. */
static void __init init_iommu_pm_ops(void)
{
        register_syscore_ops(&iommu_syscore_ops);
}
4176
4177 #else
/* Stub for the other branch of the CONFIG_PM conditional: nothing to register. */
static inline void init_iommu_pm_ops(void) {}
4179 #endif  /* CONFIG_PM */
4180
4181
4182 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4183 {
4184         struct acpi_dmar_reserved_memory *rmrr;
4185         struct dmar_rmrr_unit *rmrru;
4186         size_t length;
4187
4188         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4189         if (!rmrru)
4190                 goto out;
4191
4192         rmrru->hdr = header;
4193         rmrr = (struct acpi_dmar_reserved_memory *)header;
4194         rmrru->base_address = rmrr->base_address;
4195         rmrru->end_address = rmrr->end_address;
4196
4197         length = rmrr->end_address - rmrr->base_address + 1;
4198
4199         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4200                                 ((void *)rmrr) + rmrr->header.length,
4201                                 &rmrru->devices_cnt);
4202         if (rmrru->devices_cnt && rmrru->devices == NULL)
4203                 goto free_rmrru;
4204
4205         list_add(&rmrru->list, &dmar_rmrr_units);
4206
4207         return 0;
4208 free_rmrru:
4209         kfree(rmrru);
4210 out:
4211         return -ENOMEM;
4212 }
4213
4214 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4215 {
4216         struct dmar_atsr_unit *atsru;
4217         struct acpi_dmar_atsr *tmp;
4218
4219         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4220                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4221                 if (atsr->segment != tmp->segment)
4222                         continue;
4223                 if (atsr->header.length != tmp->header.length)
4224                         continue;
4225                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4226                         return atsru;
4227         }
4228
4229         return NULL;
4230 }
4231
4232 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4233 {
4234         struct acpi_dmar_atsr *atsr;
4235         struct dmar_atsr_unit *atsru;
4236
4237         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4238                 return 0;
4239
4240         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4241         atsru = dmar_find_atsr(atsr);
4242         if (atsru)
4243                 return 0;
4244
4245         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4246         if (!atsru)
4247                 return -ENOMEM;
4248
4249         /*
4250          * If memory is allocated from slab by ACPI _DSM method, we need to
4251          * copy the memory content because the memory buffer will be freed
4252          * on return.
4253          */
4254         atsru->hdr = (void *)(atsru + 1);
4255         memcpy(atsru->hdr, hdr, hdr->length);
4256         atsru->include_all = atsr->flags & 0x1;
4257         if (!atsru->include_all) {
4258                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4259                                 (void *)atsr + atsr->header.length,
4260                                 &atsru->devices_cnt);
4261                 if (atsru->devices_cnt && atsru->devices == NULL) {
4262                         kfree(atsru);
4263                         return -ENOMEM;
4264                 }
4265         }
4266
4267         list_add_rcu(&atsru->list, &dmar_atsr_units);
4268
4269         return 0;
4270 }
4271
/*
 * Release an ATSR unit: free its device scope array, then the unit itself.
 * The caller must already have removed it from dmar_atsr_units.
 */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
        dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
        kfree(atsru);
}
4277
4278 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279 {
4280         struct acpi_dmar_atsr *atsr;
4281         struct dmar_atsr_unit *atsru;
4282
4283         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4284         atsru = dmar_find_atsr(atsr);
4285         if (atsru) {
4286                 list_del_rcu(&atsru->list);
4287                 synchronize_rcu();
4288                 intel_iommu_free_atsr(atsru);
4289         }
4290
4291         return 0;
4292 }
4293
4294 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4295 {
4296         int i;
4297         struct device *dev;
4298         struct acpi_dmar_atsr *atsr;
4299         struct dmar_atsr_unit *atsru;
4300
4301         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4302         atsru = dmar_find_atsr(atsr);
4303         if (!atsru)
4304                 return 0;
4305
4306         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4307                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4308                                           i, dev)
4309                         return -EBUSY;
4310         }
4311
4312         return 0;
4313 }
4314
/*
 * Bring the IOMMU of one DMAR unit into service; called from
 * dmar_iommu_hotplug() when a unit is inserted.
 *
 * Returns 0 on success, and also when the unit already has an entry in
 * g_iommus.  Returns -ENXIO when a capability check (pass-through,
 * snooping, superpage) fails, or the error code of a failing setup step;
 * after a failing setup step the unit is released again through
 * free_dmar_iommu() (preceded by disable_dmar_iommu() for the later steps).
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
        int sp, ret = 0;
        struct intel_iommu *iommu = dmaru->iommu;

        /* Already registered: nothing to do. */
        if (g_iommus[iommu->seq_id])
                return 0;

        /*
         * Capability checks.  Nothing has been allocated yet, so a plain
         * return is enough on failure.
         */
        if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
                pr_warn("%s: Doesn't support hardware pass through.\n",
                        iommu->name);
                return -ENXIO;
        }
        /* NOTE(review): presumably rejects a non-snooping unit only when
         * snooping is already relied upon elsewhere - confirm against
         * domain_update_iommu_snooping(). */
        if (!ecap_sc_support(iommu->ecap) &&
            domain_update_iommu_snooping(iommu)) {
                pr_warn("%s: Doesn't support snooping.\n",
                        iommu->name);
                return -ENXIO;
        }
        /* sp < 0 means there is no superpage bit to check on this unit. */
        sp = domain_update_iommu_superpage(iommu) - 1;
        if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
                pr_warn("%s: Doesn't support large page.\n",
                        iommu->name);
                return -ENXIO;
        }

        /*
         * Disable translation if already enabled prior to OS handover.
         */
        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);

        /* Publish the unit before allocating its domain IDs and root entry. */
        g_iommus[iommu->seq_id] = iommu;
        ret = iommu_init_domains(iommu);
        if (ret == 0)
                ret = iommu_alloc_root_entry(iommu);
        if (ret)
                goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
        /* The return value of the PASID table allocation is not checked. */
        if (pasid_enabled(iommu))
                intel_svm_alloc_pasid_tables(iommu);
#endif

        /*
         * An ignored unit gets no invalidation queue, interrupt or
         * translation setup; it returns success from here.
         */
        if (dmaru->ignored) {
                /*
                 * we always have to disable PMRs or DMA may fail on this device
                 */
                if (force_on)
                        iommu_disable_protect_mem_regions(iommu);
                return 0;
        }

        intel_iommu_init_qi(iommu);
        iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
                ret = intel_svm_enable_prq(iommu);
                if (ret)
                        goto disable_iommu;
        }
#endif
        ret = dmar_set_interrupt(iommu);
        if (ret)
                goto disable_iommu;

        /*
         * Program the root entry, flush the context cache and the IOTLB
         * globally, and only then switch translation on.
         */
        iommu_set_root_entry(iommu);
        iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
        iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
        iommu_enable_translation(iommu);

        iommu_disable_protect_mem_regions(iommu);
        return 0;

disable_iommu:
        disable_dmar_iommu(iommu);
out:
        free_dmar_iommu(iommu);
        return ret;
}
4396
4397 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4398 {
4399         int ret = 0;
4400         struct intel_iommu *iommu = dmaru->iommu;
4401
4402         if (!intel_iommu_enabled)
4403                 return 0;
4404         if (iommu == NULL)
4405                 return -EINVAL;
4406
4407         if (insert) {
4408                 ret = intel_iommu_add(dmaru);
4409         } else {
4410                 disable_dmar_iommu(iommu);
4411                 free_dmar_iommu(iommu);
4412         }
4413
4414         return ret;
4415 }
4416
4417 static void intel_iommu_free_dmars(void)
4418 {
4419         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4420         struct dmar_atsr_unit *atsru, *atsr_n;
4421
4422         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4423                 list_del(&rmrru->list);
4424                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4425                 kfree(rmrru);
4426         }
4427
4428         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4429                 list_del(&atsru->list);
4430                 intel_iommu_free_atsr(atsru);
4431         }
4432 }
4433
4434 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4435 {
4436         int i, ret = 1;
4437         struct pci_bus *bus;
4438         struct pci_dev *bridge = NULL;
4439         struct device *tmp;
4440         struct acpi_dmar_atsr *atsr;
4441         struct dmar_atsr_unit *atsru;
4442
4443         dev = pci_physfn(dev);
4444         for (bus = dev->bus; bus; bus = bus->parent) {
4445                 bridge = bus->self;
4446                 /* If it's an integrated device, allow ATS */
4447                 if (!bridge)
4448                         return 1;
4449                 /* Connected via non-PCIe: no ATS */
4450                 if (!pci_is_pcie(bridge) ||
4451                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4452                         return 0;
4453                 /* If we found the root port, look it up in the ATSR */
4454                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4455                         break;
4456         }
4457
4458         rcu_read_lock();
4459         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4460                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4461                 if (atsr->segment != pci_domain_nr(dev->bus))
4462                         continue;
4463
4464                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4465                         if (tmp == &bridge->dev)
4466                                 goto out;
4467
4468                 if (atsru->include_all)
4469                         goto out;
4470         }
4471         ret = 0;
4472 out:
4473         rcu_read_unlock();
4474
4475         return ret;
4476 }
4477
4478 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4479 {
4480         int ret = 0;
4481         struct dmar_rmrr_unit *rmrru;
4482         struct dmar_atsr_unit *atsru;
4483         struct acpi_dmar_atsr *atsr;
4484         struct acpi_dmar_reserved_memory *rmrr;
4485
4486         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4487                 return 0;
4488
4489         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4490                 rmrr = container_of(rmrru->hdr,
4491                                     struct acpi_dmar_reserved_memory, header);
4492                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4493                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4494                                 ((void *)rmrr) + rmrr->header.length,
4495                                 rmrr->segment, rmrru->devices,
4496                                 rmrru->devices_cnt);
4497                         if(ret < 0)
4498                                 return ret;
4499                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4500                         dmar_remove_dev_scope(info, rmrr->segment,
4501                                 rmrru->devices, rmrru->devices_cnt);
4502                 }
4503         }
4504
4505         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4506                 if (atsru->include_all)
4507                         continue;
4508
4509                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4510                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4511                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4512                                         (void *)atsr + atsr->header.length,
4513                                         atsr->segment, atsru->devices,
4514                                         atsru->devices_cnt);
4515                         if (ret > 0)
4516                                 break;
4517                         else if(ret < 0)
4518                                 return ret;
4519                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4520                         if (dmar_remove_dev_scope(info, atsr->segment,
4521                                         atsru->devices, atsru->devices_cnt))
4522                                 break;
4523                 }
4524         }
4525
4526         return 0;
4527 }
4528
4529 /*
4530  * Here we only respond to action of unbound device from driver.
4531  *
4532  * Added device is not attached to its DMAR domain here yet. That will happen
4533  * when mapping the device to iova.
4534  */
4535 static int device_notifier(struct notifier_block *nb,
4536                                   unsigned long action, void *data)
4537 {
4538         struct device *dev = data;
4539         struct dmar_domain *domain;
4540
4541         if (iommu_dummy(dev))
4542                 return 0;
4543
4544         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4545                 return 0;
4546
4547         domain = find_domain(dev);
4548         if (!domain)
4549                 return 0;
4550
4551         dmar_remove_one_dev_info(domain, dev);
4552         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4553                 domain_exit(domain);
4554
4555         return 0;
4556 }
4557
/* Notifier block wrapping device_notifier(); registered on the PCI bus. */
static struct notifier_block device_nb = {
        .notifier_call = device_notifier,
};
4561
4562 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4563                                        unsigned long val, void *v)
4564 {
4565         struct memory_notify *mhp = v;
4566         unsigned long long start, end;
4567         unsigned long start_vpfn, last_vpfn;
4568
4569         switch (val) {
4570         case MEM_GOING_ONLINE:
4571                 start = mhp->start_pfn << PAGE_SHIFT;
4572                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4573                 if (iommu_domain_identity_map(si_domain, start, end)) {
4574                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4575                                 start, end);
4576                         return NOTIFY_BAD;
4577                 }
4578                 break;
4579
4580         case MEM_OFFLINE:
4581         case MEM_CANCEL_ONLINE:
4582                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4583                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4584                 while (start_vpfn <= last_vpfn) {
4585                         struct iova *iova;
4586                         struct dmar_drhd_unit *drhd;
4587                         struct intel_iommu *iommu;
4588                         struct page *freelist;
4589
4590                         iova = find_iova(&si_domain->iovad, start_vpfn);
4591                         if (iova == NULL) {
4592                                 pr_debug("Failed get IOVA for PFN %lx\n",
4593                                          start_vpfn);
4594                                 break;
4595                         }
4596
4597                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4598                                                      start_vpfn, last_vpfn);
4599                         if (iova == NULL) {
4600                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4601                                         start_vpfn, last_vpfn);
4602                                 return NOTIFY_BAD;
4603                         }
4604
4605                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4606                                                iova->pfn_hi);
4607
4608                         rcu_read_lock();
4609                         for_each_active_iommu(iommu, drhd)
4610                                 iommu_flush_iotlb_psi(iommu, si_domain,
4611                                         iova->pfn_lo, iova_size(iova),
4612                                         !freelist, 0);
4613                         rcu_read_unlock();
4614                         dma_free_pagelist(freelist);
4615
4616                         start_vpfn = iova->pfn_hi + 1;
4617                         free_iova_mem(iova);
4618                 }
4619                 break;
4620         }
4621
4622         return NOTIFY_OK;
4623 }
4624
/* Notifier block wrapping intel_iommu_memory_notifier(), priority 0. */
static struct notifier_block intel_iommu_memory_nb = {
        .notifier_call = intel_iommu_memory_notifier,
        .priority = 0
};
4629
4630 static void free_all_cpu_cached_iovas(unsigned int cpu)
4631 {
4632         int i;
4633
4634         for (i = 0; i < g_num_of_iommus; i++) {
4635                 struct intel_iommu *iommu = g_iommus[i];
4636                 struct dmar_domain *domain;
4637                 int did;
4638
4639                 if (!iommu)
4640                         continue;
4641
4642                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4643                         domain = get_iommu_domain(iommu, (u16)did);
4644
4645                         if (!domain)
4646                                 continue;
4647                         free_cpu_cached_iovas(cpu, &domain->iovad);
4648                 }
4649         }
4650 }
4651
/*
 * CPU hotplug callback (installed for CPUHP_IOMMU_INTEL_DEAD): drop the
 * IOVAs cached for the given CPU.  Always returns 0.
 */
static int intel_iommu_cpu_dead(unsigned int cpu)
{
        free_all_cpu_cached_iovas(cpu);
        return 0;
}
4657
/*
 * Disable translation on every IOMMU visited by for_each_iommu(), not
 * only the active ones.
 */
static void intel_disable_iommus(void)
{
        struct intel_iommu *iommu = NULL;
        struct dmar_drhd_unit *drhd;

        for_each_iommu(iommu, drhd)
                iommu_disable_translation(iommu);
}
4666
4667 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4668 {
4669         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4670
4671         return container_of(iommu_dev, struct intel_iommu, iommu);
4672 }
4673
4674 static ssize_t intel_iommu_show_version(struct device *dev,
4675                                         struct device_attribute *attr,
4676                                         char *buf)
4677 {
4678         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4679         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4680         return sprintf(buf, "%d:%d\n",
4681                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4682 }
4683 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4684
4685 static ssize_t intel_iommu_show_address(struct device *dev,
4686                                         struct device_attribute *attr,
4687                                         char *buf)
4688 {
4689         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4690         return sprintf(buf, "%llx\n", iommu->reg_phys);
4691 }
4692 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4693
4694 static ssize_t intel_iommu_show_cap(struct device *dev,
4695                                     struct device_attribute *attr,
4696                                     char *buf)
4697 {
4698         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4699         return sprintf(buf, "%llx\n", iommu->cap);
4700 }
4701 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4702
4703 static ssize_t intel_iommu_show_ecap(struct device *dev,
4704                                     struct device_attribute *attr,
4705                                     char *buf)
4706 {
4707         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4708         return sprintf(buf, "%llx\n", iommu->ecap);
4709 }
4710 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4711
4712 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4713                                       struct device_attribute *attr,
4714                                       char *buf)
4715 {
4716         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4717         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4718 }
4719 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4720
4721 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4722                                            struct device_attribute *attr,
4723                                            char *buf)
4724 {
4725         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4726         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4727                                                   cap_ndoms(iommu->cap)));
4728 }
4729 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4730
/* NULL-terminated list of the read-only sysfs attributes defined above. */
static struct attribute *intel_iommu_attrs[] = {
        &dev_attr_version.attr,
        &dev_attr_address.attr,
        &dev_attr_cap.attr,
        &dev_attr_ecap.attr,
        &dev_attr_domains_supported.attr,
        &dev_attr_domains_used.attr,
        NULL,
};
4740
/* Attribute group named "intel-iommu" holding intel_iommu_attrs. */
static struct attribute_group intel_iommu_group = {
        .name = "intel-iommu",
        .attrs = intel_iommu_attrs,
};
4745
/*
 * NULL-terminated group list; intel_iommu_init() passes it to
 * iommu_device_sysfs_add().
 */
const struct attribute_group *intel_iommu_groups[] = {
        &intel_iommu_group,
        NULL,
};
4750
/*
 * Boot-time initialisation of the Intel IOMMU driver.
 *
 * Sets up the mempool, parses the DMAR table and device scopes, and -
 * unless the IOMMU is disabled by no_iommu/dmar_disabled - initialises
 * all DMAR units, installs intel_dma_ops, registers the PM hooks, the
 * per-IOMMU sysfs devices, the PCI bus IOMMU ops and the bus, memory and
 * CPU-hotplug notifiers, and finally sets intel_iommu_enabled.
 *
 * When force_on is set (tboot), every failure on this path is a panic.
 *
 * Returns 0 on success; -ENOMEM if the mempool cannot be set up; otherwise
 * the initial -ENODEV or the error from init_dmars().  dmar_global_lock is
 * held for writing from the table parse until init_dmars() has finished
 * (or until the error path releases it).
 */
int __init intel_iommu_init(void)
{
        int ret = -ENODEV;
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;

        /* VT-d is required for a TXT/tboot launch, so enforce that */
        force_on = tboot_force_iommu();

        if (iommu_init_mempool()) {
                if (force_on)
                        panic("tboot: Failed to initialize iommu memory\n");
                return -ENOMEM;
        }

        down_write(&dmar_global_lock);
        if (dmar_table_init()) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR table\n");
                goto out_free_dmar;
        }

        if (dmar_dev_scope_init() < 0) {
                if (force_on)
                        panic("tboot: Failed to initialize DMAR device scope\n");
                goto out_free_dmar;
        }

        if (no_iommu || dmar_disabled) {
                /*
                 * We exit the function here to ensure IOMMU's remapping and
                 * mempool aren't setup, which means that the IOMMU's PMRs
                 * won't be disabled via the call to init_dmars(). So disable
                 * it explicitly here. The PMRs were setup by tboot prior to
                 * calling SENTER, but the kernel is expected to reset/tear
                 * down the PMRs.
                 */
                if (intel_iommu_tboot_noforce) {
                        for_each_iommu(iommu, drhd)
                                iommu_disable_protect_mem_regions(iommu);
                }

                /*
                 * Make sure the IOMMUs are switched off, even when we
                 * boot into a kexec kernel and the previous kernel left
                 * them enabled
                 */
                intel_disable_iommus();
                /* ret is still -ENODEV here. */
                goto out_free_dmar;
        }

        if (list_empty(&dmar_rmrr_units))
                pr_info("No RMRR found\n");

        if (list_empty(&dmar_atsr_units))
                pr_info("No ATSR found\n");

        if (dmar_init_reserved_ranges()) {
                if (force_on)
                        panic("tboot: Failed to reserve iommu ranges\n");
                goto out_free_reserved_range;
        }

        if (dmar_map_gfx)
                intel_iommu_gfx_mapped = 1;

        init_no_remapping_devices();

        ret = init_dmars();
        if (ret) {
                if (force_on)
                        panic("tboot: Failed to initialize DMARs\n");
                pr_err("Initialization failed\n");
                goto out_free_reserved_range;
        }
        /* Hardware is initialised; the rest runs without the global lock. */
        up_write(&dmar_global_lock);
        pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

#ifdef CONFIG_SWIOTLB
        swiotlb = 0;
#endif
        dma_ops = &intel_dma_ops;

        init_iommu_pm_ops();

        /* The return values of the three registration calls are not checked. */
        for_each_active_iommu(iommu, drhd) {
                iommu_device_sysfs_add(&iommu->iommu, NULL,
                                       intel_iommu_groups,
                                       "%s", iommu->name);
                iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
                iommu_device_register(&iommu->iommu);
        }

        bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
        bus_register_notifier(&pci_bus_type, &device_nb);
        /* The memory notifier is registered only with si_domain set and no hw pass-through. */
        if (si_domain && !hw_pass_through)
                register_memory_notifier(&intel_iommu_memory_nb);
        cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
                          intel_iommu_cpu_dead);
        intel_iommu_enabled = 1;

        return 0;

        /* Error unwinding; entered with dmar_global_lock still held. */
out_free_reserved_range:
        put_iova_domain(&reserved_iova_list);
out_free_dmar:
        intel_iommu_free_dmars();
        up_write(&dmar_global_lock);
        iommu_exit_mempool();
        return ret;
}
4862
4863 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4864 {
4865         struct intel_iommu *iommu = opaque;
4866
4867         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4868         return 0;
4869 }
4870
4871 /*
4872  * NB - intel-iommu lacks any sort of reference counting for the users of
4873  * dependent devices.  If multiple endpoints have intersecting dependent
4874  * devices, unbinding the driver from any one of them will possibly leave
4875  * the others unable to operate.
4876  */
4877 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4878 {
4879         if (!iommu || !dev || !dev_is_pci(dev))
4880                 return;
4881
4882         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4883 }
4884
/*
 * Detach one device from its domain and free its device_domain_info.
 *
 * Must be called with device_domain_lock held (asserted below).  A NULL
 * @info triggers a WARN and is otherwise ignored.
 */
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
        struct intel_iommu *iommu;
        unsigned long flags;

        assert_spin_locked(&device_domain_lock);

        if (WARN_ON(!info))
                return;

        iommu = info->iommu;

        /* Only entries with a device attached have hardware state to clear. */
        if (info->dev) {
                iommu_disable_dev_iotlb(info);
                domain_context_clear(iommu, info->dev);
        }

        unlink_domain_info(info);

        /* Drop the domain's attachment to this IOMMU under the IOMMU's lock. */
        spin_lock_irqsave(&iommu->lock, flags);
        domain_detach_iommu(info->domain, iommu);
        spin_unlock_irqrestore(&iommu->lock, flags);

        free_devinfo_mem(info);
}
4910
4911 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4912                                      struct device *dev)
4913 {
4914         struct device_domain_info *info;
4915         unsigned long flags;
4916
4917         spin_lock_irqsave(&device_domain_lock, flags);
4918         info = dev->archdata.iommu;
4919         __dmar_remove_one_dev_info(info);
4920         spin_unlock_irqrestore(&device_domain_lock, flags);
4921 }
4922
4923 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4924 {
4925         int adjust_width;
4926
4927         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4928                         DMA_32BIT_PFN);
4929         domain_reserve_special_ranges(domain);
4930
4931         /* calculate AGAW */
4932         domain->gaw = guest_width;
4933         adjust_width = guestwidth_to_adjustwidth(guest_width);
4934         domain->agaw = width_to_agaw(adjust_width);
4935
4936         domain->iommu_coherency = 0;
4937         domain->iommu_snooping = 0;
4938         domain->iommu_superpage = 0;
4939         domain->max_addr = 0;
4940
4941         /* always allocate the top pgd */
4942         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4943         if (!domain->pgd)
4944                 return -ENOMEM;
4945         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4946         return 0;
4947 }
4948
4949 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4950 {
4951         struct dmar_domain *dmar_domain;
4952         struct iommu_domain *domain;
4953
4954         if (type != IOMMU_DOMAIN_UNMANAGED)
4955                 return NULL;
4956
4957         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4958         if (!dmar_domain) {
4959                 pr_err("Can't allocate dmar_domain\n");
4960                 return NULL;
4961         }
4962         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4963                 pr_err("Domain initialization failed\n");
4964                 domain_exit(dmar_domain);
4965                 return NULL;
4966         }
4967         domain_update_iommu_cap(dmar_domain);
4968
4969         domain = &dmar_domain->domain;
4970         domain->geometry.aperture_start = 0;
4971         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4972         domain->geometry.force_aperture = true;
4973
4974         return domain;
4975 }
4976
/* Domain free for the IOMMU API: tear down the enclosing dmar_domain. */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
        domain_exit(to_dmar_domain(domain));
}
4981
/*
 * Attach @dev to @domain for the IOMMU API.
 *
 * Returns -EPERM for devices locked to an RMRR, -ENODEV if no IOMMU
 * serves the device, -EFAULT if the IOMMU's address width cannot cover
 * the addresses already mapped in the domain, otherwise the result of
 * domain_add_dev_info().
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
                                     struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct intel_iommu *iommu;
        int addr_width;
        u8 bus, devfn;

        if (device_is_rmrr_locked(dev)) {
                dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
                return -EPERM;
        }

        /* normally dev is not mapped */
        if (unlikely(domain_context_mapped(dev))) {
                struct dmar_domain *old_domain;

                /*
                 * The device still belongs to another domain: detach it,
                 * and destroy that domain if it is neither VM nor static
                 * identity and is now empty.
                 */
                old_domain = find_domain(dev);
                if (old_domain) {
                        rcu_read_lock();
                        dmar_remove_one_dev_info(old_domain, dev);
                        rcu_read_unlock();

                        if (!domain_type_is_vm_or_si(old_domain) &&
                             list_empty(&old_domain->devices))
                                domain_exit(old_domain);
                }
        }

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        /* check if this iommu agaw is sufficient for max mapped address */
        addr_width = agaw_to_width(iommu->agaw);
        if (addr_width > cap_mgaw(iommu->cap))
                addr_width = cap_mgaw(iommu->cap);

        /*
         * NOTE(review): 1LL << addr_width is undefined once addr_width
         * reaches 63; presumably real hardware widths stay below that -
         * confirm against agaw_to_width()/cap_mgaw().
         */
        if (dmar_domain->max_addr > (1LL << addr_width)) {
                pr_err("%s: iommu width (%d) is not "
                       "sufficient for the mapped address (%llx)\n",
                       __func__, addr_width, dmar_domain->max_addr);
                return -EFAULT;
        }
        dmar_domain->gaw = addr_width;

        /*
         * Knock out extra levels of page tables if necessary
         */
        while (iommu->agaw < dmar_domain->agaw) {
                struct dma_pte *pte;

                /*
                 * Replace the top-level table by the table its first
                 * entry points to (if present) and free the old top.
                 */
                pte = dmar_domain->pgd;
                if (dma_pte_present(pte)) {
                        dmar_domain->pgd = (struct dma_pte *)
                                phys_to_virt(dma_pte_addr(pte));
                        free_pgtable_page(pte);
                }
                dmar_domain->agaw--;
        }

        return domain_add_dev_info(dmar_domain, dev);
}
5045
/* Detach @dev from the dmar_domain that embeds the IOMMU API @domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	dmar_remove_one_dev_info(dmar_domain, dev);
}
5051
5052 static int intel_iommu_map(struct iommu_domain *domain,
5053                            unsigned long iova, phys_addr_t hpa,
5054                            size_t size, int iommu_prot)
5055 {
5056         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5057         u64 max_addr;
5058         int prot = 0;
5059         int ret;
5060
5061         if (iommu_prot & IOMMU_READ)
5062                 prot |= DMA_PTE_READ;
5063         if (iommu_prot & IOMMU_WRITE)
5064                 prot |= DMA_PTE_WRITE;
5065         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5066                 prot |= DMA_PTE_SNP;
5067
5068         max_addr = iova + size;
5069         if (dmar_domain->max_addr < max_addr) {
5070                 u64 end;
5071
5072                 /* check if minimum agaw is sufficient for mapped address */
5073                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5074                 if (end < max_addr) {
5075                         pr_err("%s: iommu width (%d) is not "
5076                                "sufficient for the mapped address (%llx)\n",
5077                                __func__, dmar_domain->gaw, max_addr);
5078                         return -EFAULT;
5079                 }
5080                 dmar_domain->max_addr = max_addr;
5081         }
5082         /* Round up size to next multiple of PAGE_SIZE, if it and
5083            the low bits of hpa would take us onto the next page */
5084         size = aligned_nrpages(hpa, size);
5085         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5086                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5087         return ret;
5088 }
5089
5090 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5091                                 unsigned long iova, size_t size)
5092 {
5093         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5094         struct page *freelist = NULL;
5095         struct intel_iommu *iommu;
5096         unsigned long start_pfn, last_pfn;
5097         unsigned int npages;
5098         int iommu_id, level = 0;
5099
5100         /* Cope with horrid API which requires us to unmap more than the
5101            size argument if it happens to be a large-page mapping. */
5102         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5103
5104         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5105                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5106
5107         start_pfn = iova >> VTD_PAGE_SHIFT;
5108         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5109
5110         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5111
5112         npages = last_pfn - start_pfn + 1;
5113
5114         for_each_domain_iommu(iommu_id, dmar_domain) {
5115                 iommu = g_iommus[iommu_id];
5116
5117                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5118                                       start_pfn, npages, !freelist, 0);
5119         }
5120
5121         dma_free_pagelist(freelist);
5122
5123         if (dmar_domain->max_addr == iova + size)
5124                 dmar_domain->max_addr = iova;
5125
5126         return size;
5127 }
5128
5129 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5130                                             dma_addr_t iova)
5131 {
5132         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5133         struct dma_pte *pte;
5134         int level = 0;
5135         u64 phys = 0;
5136
5137         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5138         if (pte && dma_pte_present(pte))
5139                 phys = dma_pte_addr(pte) +
5140                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5141                                                 VTD_PAGE_SHIFT) - 1));
5142
5143         return phys;
5144 }
5145
5146 static bool intel_iommu_capable(enum iommu_cap cap)
5147 {
5148         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5149                 return domain_update_iommu_snooping(NULL) == 1;
5150         if (cap == IOMMU_CAP_INTR_REMAP)
5151                 return irq_remapping_enabled == 1;
5152
5153         return false;
5154 }
5155
5156 static int intel_iommu_add_device(struct device *dev)
5157 {
5158         struct intel_iommu *iommu;
5159         struct iommu_group *group;
5160         u8 bus, devfn;
5161
5162         iommu = device_to_iommu(dev, &bus, &devfn);
5163         if (!iommu)
5164                 return -ENODEV;
5165
5166         iommu_device_link(&iommu->iommu, dev);
5167
5168         group = iommu_group_get_for_dev(dev);
5169
5170         if (IS_ERR(group))
5171                 return PTR_ERR(group);
5172
5173         iommu_group_put(group);
5174         return 0;
5175 }
5176
5177 static void intel_iommu_remove_device(struct device *dev)
5178 {
5179         struct intel_iommu *iommu;
5180         u8 bus, devfn;
5181
5182         iommu = device_to_iommu(dev, &bus, &devfn);
5183         if (!iommu)
5184                 return;
5185
5186         iommu_group_remove_device(dev);
5187
5188         iommu_device_unlink(&iommu->iommu, dev);
5189 }
5190
5191 static void intel_iommu_get_resv_regions(struct device *device,
5192                                          struct list_head *head)
5193 {
5194         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5195         struct iommu_resv_region *reg;
5196         struct dmar_rmrr_unit *rmrr;
5197         struct device *i_dev;
5198         int i;
5199
5200         down_read(&dmar_global_lock);
5201         for_each_rmrr_units(rmrr) {
5202                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5203                                           i, i_dev) {
5204                         struct iommu_resv_region *resv;
5205                         size_t length;
5206
5207                         if (i_dev != device)
5208                                 continue;
5209
5210                         length = rmrr->end_address - rmrr->base_address + 1;
5211                         resv = iommu_alloc_resv_region(rmrr->base_address,
5212                                                        length, prot,
5213                                                        IOMMU_RESV_DIRECT);
5214                         if (!resv)
5215                                 break;
5216
5217                         list_add_tail(&resv->list, head);
5218                 }
5219         }
5220         up_read(&dmar_global_lock);
5221
5222         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5223                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5224                                       0, IOMMU_RESV_MSI);
5225         if (!reg)
5226                 return;
5227         list_add_tail(&reg->list, head);
5228 }
5229
5230 static void intel_iommu_put_resv_regions(struct device *dev,
5231                                          struct list_head *head)
5232 {
5233         struct iommu_resv_region *entry, *next;
5234
5235         list_for_each_entry_safe(entry, next, head, list)
5236                 kfree(entry);
5237 }
5238
5239 #ifdef CONFIG_INTEL_IOMMU_SVM
5240 #define MAX_NR_PASID_BITS (20)
5241 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5242 {
5243         /*
5244          * Convert ecap_pss to extend context entry pts encoding, also
5245          * respect the soft pasid_max value set by the iommu.
5246          * - number of PASID bits = ecap_pss + 1
5247          * - number of PASID table entries = 2^(pts + 5)
5248          * Therefore, pts = ecap_pss - 4
5249          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5250          */
5251         if (ecap_pss(iommu->ecap) < 5)
5252                 return 0;
5253
5254         /* pasid_max is encoded as actual number of entries not the bits */
5255         return find_first_bit((unsigned long *)&iommu->pasid_max,
5256                         MAX_NR_PASID_BITS) - 5;
5257 }
5258
5259 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5260 {
5261         struct device_domain_info *info;
5262         struct context_entry *context;
5263         struct dmar_domain *domain;
5264         unsigned long flags;
5265         u64 ctx_lo;
5266         int ret;
5267
5268         domain = get_valid_domain_for_dev(sdev->dev);
5269         if (!domain)
5270                 return -EINVAL;
5271
5272         spin_lock_irqsave(&device_domain_lock, flags);
5273         spin_lock(&iommu->lock);
5274
5275         ret = -EINVAL;
5276         info = sdev->dev->archdata.iommu;
5277         if (!info || !info->pasid_supported)
5278                 goto out;
5279
5280         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5281         if (WARN_ON(!context))
5282                 goto out;
5283
5284         ctx_lo = context[0].lo;
5285
5286         sdev->did = domain->iommu_did[iommu->seq_id];
5287         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5288
5289         if (!(ctx_lo & CONTEXT_PASIDE)) {
5290                 if (iommu->pasid_state_table)
5291                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5292                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5293                         intel_iommu_get_pts(iommu);
5294
5295                 wmb();
5296                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5297                  * extended to permit requests-with-PASID if the PASIDE bit
5298                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5299                  * however, the PASIDE bit is ignored and requests-with-PASID
5300                  * are unconditionally blocked. Which makes less sense.
5301                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5302                  * "guest mode" translation types depending on whether ATS
5303                  * is available or not. Annoyingly, we can't use the new
5304                  * modes *unless* PASIDE is set. */
5305                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5306                         ctx_lo &= ~CONTEXT_TT_MASK;
5307                         if (info->ats_supported)
5308                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5309                         else
5310                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5311                 }
5312                 ctx_lo |= CONTEXT_PASIDE;
5313                 if (iommu->pasid_state_table)
5314                         ctx_lo |= CONTEXT_DINVE;
5315                 if (info->pri_supported)
5316                         ctx_lo |= CONTEXT_PRS;
5317                 context[0].lo = ctx_lo;
5318                 wmb();
5319                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5320                                            DMA_CCMD_MASK_NOBIT,
5321                                            DMA_CCMD_DEVICE_INVL);
5322         }
5323
5324         /* Enable PASID support in the device, if it wasn't already */
5325         if (!info->pasid_enabled)
5326                 iommu_enable_dev_iotlb(info);
5327
5328         if (info->ats_enabled) {
5329                 sdev->dev_iotlb = 1;
5330                 sdev->qdep = info->ats_qdep;
5331                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5332                         sdev->qdep = 0;
5333         }
5334         ret = 0;
5335
5336  out:
5337         spin_unlock(&iommu->lock);
5338         spin_unlock_irqrestore(&device_domain_lock, flags);
5339
5340         return ret;
5341 }
5342
5343 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5344 {
5345         struct intel_iommu *iommu;
5346         u8 bus, devfn;
5347
5348         if (iommu_dummy(dev)) {
5349                 dev_warn(dev,
5350                          "No IOMMU translation for device; cannot enable SVM\n");
5351                 return NULL;
5352         }
5353
5354         iommu = device_to_iommu(dev, &bus, &devfn);
5355         if ((!iommu)) {
5356                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5357                 return NULL;
5358         }
5359
5360         if (!iommu->pasid_table) {
5361                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5362                 return NULL;
5363         }
5364
5365         return iommu;
5366 }
5367 #endif /* CONFIG_INTEL_IOMMU_SVM */
5368
5369 const struct iommu_ops intel_iommu_ops = {
5370         .capable                = intel_iommu_capable,
5371         .domain_alloc           = intel_iommu_domain_alloc,
5372         .domain_free            = intel_iommu_domain_free,
5373         .attach_dev             = intel_iommu_attach_device,
5374         .detach_dev             = intel_iommu_detach_device,
5375         .map                    = intel_iommu_map,
5376         .unmap                  = intel_iommu_unmap,
5377         .map_sg                 = default_iommu_map_sg,
5378         .iova_to_phys           = intel_iommu_iova_to_phys,
5379         .add_device             = intel_iommu_add_device,
5380         .remove_device          = intel_iommu_remove_device,
5381         .get_resv_regions       = intel_iommu_get_resv_regions,
5382         .put_resv_regions       = intel_iommu_put_resv_regions,
5383         .device_group           = pci_device_group,
5384         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5385 };
5386
5387 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5388 {
5389         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5390         pr_info("Disabling IOMMU for graphics on this chipset\n");
5391         dmar_map_gfx = 0;
5392 }
5393
5394 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5395 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5398 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5400 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5401
5402 static void quirk_iommu_rwbf(struct pci_dev *dev)
5403 {
5404         /*
5405          * Mobile 4 Series Chipset neglects to set RWBF capability,
5406          * but needs it. Same seems to hold for the desktop versions.
5407          */
5408         pr_info("Forcing write-buffer flush capability\n");
5409         rwbf_quirk = 1;
5410 }
5411
5412 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5413 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5414 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5415 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5416 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5417 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5418 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5419
5420 #define GGC 0x52
5421 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5422 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5423 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5424 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5425 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5426 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5427 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5428 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5429
5430 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5431 {
5432         unsigned short ggc;
5433
5434         if (pci_read_config_word(dev, GGC, &ggc))
5435                 return;
5436
5437         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5438                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5439                 dmar_map_gfx = 0;
5440         } else if (dmar_map_gfx) {
5441                 /* we have to ensure the gfx device is idle before we flush */
5442                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5443                 intel_iommu_strict = 1;
5444        }
5445 }
5446 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5447 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5448 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5449 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5450
5451 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5452    ISOCH DMAR unit for the Azalia sound device, but not give it any
5453    TLB entries, which causes it to deadlock. Check for that.  We do
5454    this in a function called from init_dmars(), instead of in a PCI
5455    quirk, because we don't want to print the obnoxious "BIOS broken"
5456    message if VT-d is actually disabled.
5457 */
5458 static void __init check_tylersburg_isoch(void)
5459 {
5460         struct pci_dev *pdev;
5461         uint32_t vtisochctrl;
5462
5463         /* If there's no Azalia in the system anyway, forget it. */
5464         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5465         if (!pdev)
5466                 return;
5467         pci_dev_put(pdev);
5468
5469         /* System Management Registers. Might be hidden, in which case
5470            we can't do the sanity check. But that's OK, because the
5471            known-broken BIOSes _don't_ actually hide it, so far. */
5472         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5473         if (!pdev)
5474                 return;
5475
5476         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5477                 pci_dev_put(pdev);
5478                 return;
5479         }
5480
5481         pci_dev_put(pdev);
5482
5483         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5484         if (vtisochctrl & 1)
5485                 return;
5486
5487         /* Drop all bits other than the number of TLB entries */
5488         vtisochctrl &= 0x1c;
5489
5490         /* If we have the recommended number of TLB entries (16), fine. */
5491         if (vtisochctrl == 0x10)
5492                 return;
5493
5494         /* Zero TLB entries? You get to ride the short bus to school. */
5495         if (!vtisochctrl) {
5496                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5497                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5498                      dmi_get_system_info(DMI_BIOS_VENDOR),
5499                      dmi_get_system_info(DMI_BIOS_VERSION),
5500                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5501                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5502                 return;
5503         }
5504
5505         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5506                vtisochctrl);
5507 }