1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
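/*
 * Worked example (illustrative, not from the original source): with
 * gaw = 48, __DOMAIN_MAX_PFN is (1ULL << 36) - 1; that fits in an
 * unsigned long on 64-bit, so DOMAIN_MAX_PFN(48) is the same value and
 * DOMAIN_MAX_ADDR(48) is ((1ULL << 36) - 1) << 12, i.e. 2^48 - 4KiB.
 */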
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
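/*
 * Illustrative note (not from the original source): with 4KiB being
 * 1 << 12, the mask ~0xFFFUL has every bit from 12 upwards set, so the
 * IOMMU core sees bit 12 (4KiB), bit 13 (8KiB), ... bit 21 (2MiB),
 * bit 30 (1GiB) and so on as supported, i.e. every power-of-two
 * multiple of 4KiB.
 */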
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
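/*
 * Worked example (illustrative, not from the original source): a
 * 48-bit address width gives width_to_agaw(48) = DIV_ROUND_UP(18, 9)
 * = 2, and agaw_to_width(2) = 30 + 2 * 9 = 48, i.e. a 4-level page
 * table walking 9 bits per level on top of the 4KiB page offset.
 */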
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
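/*
 * Illustrative example (not part of the original source): at level 2,
 * level_to_offset_bits() is 9, so level_size() is 512 4KiB pages
 * (2MiB), level_mask() clears the low 9 pfn bits, and
 * pfn_level_offset(pfn, 2) selects bits 9..17 of the pfn as the index
 * into that level's 512-entry table.
 */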
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
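/*
 * Illustrative note (not from the original source): on x86 with 4KiB
 * MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so dma_to_mm_pfn()
 * and mm_to_dma_pfn() are identity conversions; the shifts only do
 * real work if MM pages were ever larger than VT-d's 4KiB pages.
 */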
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be successfully enabled
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
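/*
 * Note (illustrative, based on the VT-d scalable-mode layout): in
 * scalable mode each root entry holds two context-table pointers; the
 * lower pointer (LCTP) covers devfn 0x00-0x7f and the upper pointer
 * (UCTP) covers devfn 0x80-0xff, which is why free_context_table()
 * below walks both halves when sm_supported().
 */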
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
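/*
 * Summary of the legacy context-entry bits manipulated by the helpers
 * above (illustrative recap of the code, not new behaviour): lo bit 0
 * is Present, bits 2-3 the translation type, bit 11 the (extended
 * context) PASID-enable flag, and bits 12+ the address root; hi bits
 * 0-2 hold the address width, bit 3 the "copied from old kernel"
 * marker, and bits 8-23 the domain id.
 */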
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
310 #define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
314 struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
323 struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
334 #define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
337 /* bitmap for indexing intel_iommus */
338 static int g_num_of_iommus;
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
346 static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
371 #define IDENTMAP_ALL 1
372 #define IDENTMAP_GFX 2
373 #define IDENTMAP_AZALIA 4
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
395 struct device_domain_info *info;
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
401 spin_unlock_irqrestore(&device_domain_lock, flags);
405 spin_unlock_irqrestore(&device_domain_lock, flags);
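/*
 * Usage sketch (hypothetical callback, for illustration only):
 *
 *	static int count_devinfo(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *	...
 *	int n = 0;
 *	for_each_device_domain(count_devinfo, &n);
 *
 * Iteration stops early if the callback returns a non-zero value.
 */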
410 const struct iommu_ops intel_iommu_ops;
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 static void init_translation_status(struct intel_iommu *iommu)
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 return container_of(dom, struct dmar_domain, domain);
437 static int __init intel_iommu_setup(char *str)
442 if (!strncmp(str, "on", 2)) {
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
473 str += strcspn(str, ",");
479 __setup("intel_iommu=", intel_iommu_setup);
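/*
 * Example (illustrative): options are comma-separated on the kernel
 * command line, e.g.
 *
 *	intel_iommu=on,sp_off,strict
 *
 * enables the IOMMU, disables superpage use and disables batched
 * IOTLB flushing.
 */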
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
484 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
486 struct dmar_domain **domains;
489 domains = iommu->domains[idx];
493 return domains[did & 0xff];
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
499 struct dmar_domain **domains;
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
511 domains[did & 0xff] = domain;
514 void *alloc_pgtable_page(int node)
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
521 vaddr = page_address(page);
525 void free_pgtable_page(void *vaddr)
527 free_page((unsigned long)vaddr);
530 static inline void *alloc_domain_mem(void)
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
535 static void free_domain_mem(void *vaddr)
537 kmem_cache_free(iommu_domain_cache, vaddr);
540 static inline void * alloc_devinfo_mem(void)
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
545 static inline void free_devinfo_mem(void *vaddr)
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
550 static inline int domain_type_is_si(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
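/*
 * Example (illustrative): for a domain with agaw = 2 (48-bit GAW),
 * addr_width is 48 - 12 = 36, so any pfn at or above 1UL << 36 makes
 * domain_pfn_supported() return false.
 */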
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
571 if (test_bit(agaw, &sagaw))
579 * Calculate max SAGAW for each iommu.
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
587 * Calculate agaw for each iommu.
588 * "SAGAW" may differ across iommus; use a default agaw, and fall back
589 * to a smaller supported agaw for iommus that don't support the default.
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
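/*
 * Illustrative example: with DEFAULT_DOMAIN_ADDRESS_WIDTH = 57,
 * width_to_agaw(57) = DIV_ROUND_UP(27, 9) = 3; if the IOMMU's SAGAW
 * field only has bit 2 set (48-bit, 4-level tables), the loop above
 * walks down from 3 until test_bit(2, &sagaw) succeeds and agaw = 2
 * is used.
 */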
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 for_each_domain_iommu(iommu_id, domain)
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return g_iommus[iommu_id];
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
627 domain->iommu_coherency = 1;
629 for_each_domain_iommu(i, domain) {
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
639 /* No hardware attached; use lowest common denominator */
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
657 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_sc_support(iommu->ecap)) {
670 static int domain_update_iommu_superpage(struct intel_iommu *skip)
672 struct dmar_drhd_unit *drhd;
673 struct intel_iommu *iommu;
676 if (!intel_iommu_superpage) {
680 /* set iommu_superpage to the smallest common denominator */
682 for_each_active_iommu(iommu, drhd) {
684 mask &= cap_super_page_val(iommu->cap);
694 /* Some capabilities may be different across iommus */
695 static void domain_update_iommu_cap(struct dmar_domain *domain)
697 domain_update_iommu_coherency(domain);
698 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
699 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
702 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
705 struct root_entry *root = &iommu->root_entry[bus];
706 struct context_entry *context;
710 if (sm_supported(iommu)) {
718 context = phys_to_virt(*entry & VTD_PAGE_MASK);
720 unsigned long phy_addr;
724 context = alloc_pgtable_page(iommu->node);
728 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
729 phy_addr = virt_to_phys((void *)context);
730 *entry = phy_addr | 1;
731 __iommu_flush_cache(iommu, entry, sizeof(*entry));
733 return &context[devfn];
736 static int iommu_dummy(struct device *dev)
738 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
742 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
743 * sub-hierarchy of a candidate PCI-PCI bridge
744 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
745 * @bridge: the candidate PCI-PCI bridge
747 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
750 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
752 struct pci_dev *pdev, *pbridge;
754 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
757 pdev = to_pci_dev(dev);
758 pbridge = to_pci_dev(bridge);
760 if (pbridge->subordinate &&
761 pbridge->subordinate->number <= pdev->bus->number &&
762 pbridge->subordinate->busn_res.end >= pdev->bus->number)
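/*
 * Example (illustrative): if @bridge's secondary interface spawns
 * buses 03..07 (subordinate->number == 3, busn_res.end == 7), then a
 * device at 05:00.0 lies in that range and the function returns true.
 */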
768 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
770 struct dmar_drhd_unit *drhd = NULL;
771 struct intel_iommu *iommu;
773 struct pci_dev *pdev = NULL;
777 if (iommu_dummy(dev))
780 if (dev_is_pci(dev)) {
781 struct pci_dev *pf_pdev;
783 pdev = to_pci_dev(dev);
786 /* VMD child devices currently cannot be handled individually */
787 if (is_vmd(pdev->bus))
791 /* VFs aren't listed in scope tables; we need to look up
792 * the PF instead to find the IOMMU. */
793 pf_pdev = pci_physfn(pdev);
795 segment = pci_domain_nr(pdev->bus);
796 } else if (has_acpi_companion(dev))
797 dev = &ACPI_COMPANION(dev)->dev;
800 for_each_active_iommu(iommu, drhd) {
801 if (pdev && segment != drhd->segment)
804 for_each_active_dev_scope(drhd->devices,
805 drhd->devices_cnt, i, tmp) {
807 /* For a VF use its original BDF# not that of the PF
808 * which we used for the IOMMU lookup. Strictly speaking
809 * we could do this for all PCI devices; we only need to
810 * get the BDF# from the scope table for ACPI matches. */
811 if (pdev && pdev->is_virtfn)
814 *bus = drhd->devices[i].bus;
815 *devfn = drhd->devices[i].devfn;
819 if (is_downstream_to_pci_bridge(dev, tmp))
823 if (pdev && drhd->include_all) {
825 *bus = pdev->bus->number;
826 *devfn = pdev->devfn;
837 static void domain_flush_cache(struct dmar_domain *domain,
838 void *addr, int size)
840 if (!domain->iommu_coherency)
841 clflush_cache_range(addr, size);
844 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
846 struct context_entry *context;
850 spin_lock_irqsave(&iommu->lock, flags);
851 context = iommu_context_addr(iommu, bus, devfn, 0);
853 ret = context_present(context);
854 spin_unlock_irqrestore(&iommu->lock, flags);
858 static void free_context_table(struct intel_iommu *iommu)
862 struct context_entry *context;
864 spin_lock_irqsave(&iommu->lock, flags);
865 if (!iommu->root_entry) {
868 for (i = 0; i < ROOT_ENTRY_NR; i++) {
869 context = iommu_context_addr(iommu, i, 0, 0);
871 free_pgtable_page(context);
873 if (!sm_supported(iommu))
876 context = iommu_context_addr(iommu, i, 0x80, 0);
878 free_pgtable_page(context);
881 free_pgtable_page(iommu->root_entry);
882 iommu->root_entry = NULL;
884 spin_unlock_irqrestore(&iommu->lock, flags);
887 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
888 unsigned long pfn, int *target_level)
890 struct dma_pte *parent, *pte;
891 int level = agaw_to_level(domain->agaw);
894 BUG_ON(!domain->pgd);
896 if (!domain_pfn_supported(domain, pfn))
897 /* Address beyond IOMMU's addressing capabilities. */
900 parent = domain->pgd;
905 offset = pfn_level_offset(pfn, level);
906 pte = &parent[offset];
907 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
909 if (level == *target_level)
912 if (!dma_pte_present(pte)) {
915 tmp_page = alloc_pgtable_page(domain->nid);
920 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
921 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
922 if (cmpxchg64(&pte->val, 0ULL, pteval))
923 /* Someone else set it while we were thinking; use theirs. */
924 free_pgtable_page(tmp_page);
926 domain_flush_cache(domain, pte, sizeof(*pte));
931 parent = phys_to_virt(dma_pte_addr(pte));
936 *target_level = level;
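/*
 * Walk summary (illustrative): starting at domain->pgd with
 * level = agaw_to_level(agaw), each iteration indexes the current
 * table with pfn_level_offset(pfn, level), allocates the next-level
 * table on demand with a cmpxchg64() so concurrent mappers don't
 * clobber each other, and descends until *target_level (or a
 * superpage / non-present entry when *target_level is 0) is reached.
 */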
941 /* return address's pte at specific level */
942 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
944 int level, int *large_page)
946 struct dma_pte *parent, *pte;
947 int total = agaw_to_level(domain->agaw);
950 parent = domain->pgd;
951 while (level <= total) {
952 offset = pfn_level_offset(pfn, total);
953 pte = &parent[offset];
957 if (!dma_pte_present(pte)) {
962 if (dma_pte_superpage(pte)) {
967 parent = phys_to_virt(dma_pte_addr(pte));
973 /* clear last level pte, a tlb flush should be followed */
974 static void dma_pte_clear_range(struct dmar_domain *domain,
975 unsigned long start_pfn,
976 unsigned long last_pfn)
978 unsigned int large_page;
979 struct dma_pte *first_pte, *pte;
981 BUG_ON(!domain_pfn_supported(domain, start_pfn));
982 BUG_ON(!domain_pfn_supported(domain, last_pfn));
983 BUG_ON(start_pfn > last_pfn);
985 /* we don't need lock here; nobody else touches the iova range */
988 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
990 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
995 start_pfn += lvl_to_nr_pages(large_page);
997 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
999 domain_flush_cache(domain, first_pte,
1000 (void *)pte - (void *)first_pte);
1002 } while (start_pfn && start_pfn <= last_pfn);
1005 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1006 int retain_level, struct dma_pte *pte,
1007 unsigned long pfn, unsigned long start_pfn,
1008 unsigned long last_pfn)
1010 pfn = max(start_pfn, pfn);
1011 pte = &pte[pfn_level_offset(pfn, level)];
1014 unsigned long level_pfn;
1015 struct dma_pte *level_pte;
1017 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1020 level_pfn = pfn & level_mask(level);
1021 level_pte = phys_to_virt(dma_pte_addr(pte));
1024 dma_pte_free_level(domain, level - 1, retain_level,
1025 level_pte, level_pfn, start_pfn,
1030 * Free the page table if we're below the level we want to
1031 * retain and the range covers the entire table.
1033 if (level < retain_level && !(start_pfn > level_pfn ||
1034 last_pfn < level_pfn + level_size(level) - 1)) {
1036 domain_flush_cache(domain, pte, sizeof(*pte));
1037 free_pgtable_page(level_pte);
1040 pfn += level_size(level);
1041 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1045 * clear last level (leaf) ptes and free page table pages below the
1046 * level we wish to keep intact.
1048 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1049 unsigned long start_pfn,
1050 unsigned long last_pfn,
1053 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055 BUG_ON(start_pfn > last_pfn);
1057 dma_pte_clear_range(domain, start_pfn, last_pfn);
1059 /* We don't need lock here; nobody else touches the iova range */
1060 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1061 domain->pgd, 0, start_pfn, last_pfn);
1064 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1065 free_pgtable_page(domain->pgd);
1070 /* When a page at a given level is being unlinked from its parent, we don't
1071 need to *modify* it at all. All we need to do is make a list of all the
1072 pages which can be freed just as soon as we've flushed the IOTLB and we
1073 know the hardware page-walk will no longer touch them.
1074 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1076 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1077 int level, struct dma_pte *pte,
1078 struct page *freelist)
1082 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1083 pg->freelist = freelist;
1089 pte = page_address(pg);
1091 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1092 freelist = dma_pte_list_pagetables(domain, level - 1,
1095 } while (!first_pte_in_page(pte));
1100 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1101 struct dma_pte *pte, unsigned long pfn,
1102 unsigned long start_pfn,
1103 unsigned long last_pfn,
1104 struct page *freelist)
1106 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1108 pfn = max(start_pfn, pfn);
1109 pte = &pte[pfn_level_offset(pfn, level)];
1112 unsigned long level_pfn;
1114 if (!dma_pte_present(pte))
1117 level_pfn = pfn & level_mask(level);
1119 /* If range covers entire pagetable, free it */
1120 if (start_pfn <= level_pfn &&
1121 last_pfn >= level_pfn + level_size(level) - 1) {
1122 /* These subordinate page tables are going away entirely. Don't
1123 bother to clear them; we're just going to *free* them. */
1124 if (level > 1 && !dma_pte_superpage(pte))
1125 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1131 } else if (level > 1) {
1132 /* Recurse down into a level that isn't *entirely* obsolete */
1133 freelist = dma_pte_clear_level(domain, level - 1,
1134 phys_to_virt(dma_pte_addr(pte)),
1135 level_pfn, start_pfn, last_pfn,
1139 pfn += level_size(level);
1140 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1143 domain_flush_cache(domain, first_pte,
1144 (void *)++last_pte - (void *)first_pte);
1149 /* We can't just free the pages because the IOMMU may still be walking
1150 the page tables, and may have cached the intermediate levels. The
1151 pages can only be freed after the IOTLB flush has been done. */
1152 static struct page *domain_unmap(struct dmar_domain *domain,
1153 unsigned long start_pfn,
1154 unsigned long last_pfn)
1156 struct page *freelist;
1158 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1159 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1160 BUG_ON(start_pfn > last_pfn);
1162 /* we don't need lock here; nobody else touches the iova range */
1163 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1164 domain->pgd, 0, start_pfn, last_pfn, NULL);
1167 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1168 struct page *pgd_page = virt_to_page(domain->pgd);
1169 pgd_page->freelist = freelist;
1170 freelist = pgd_page;
1178 static void dma_free_pagelist(struct page *freelist)
1182 while ((pg = freelist)) {
1183 freelist = pg->freelist;
1184 free_pgtable_page(page_address(pg));
1188 static void iova_entry_free(unsigned long data)
1190 struct page *freelist = (struct page *)data;
1192 dma_free_pagelist(freelist);
1195 /* iommu handling */
1196 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1198 struct root_entry *root;
1199 unsigned long flags;
1201 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1203 pr_err("Allocating root entry for %s failed\n",
1208 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1210 spin_lock_irqsave(&iommu->lock, flags);
1211 iommu->root_entry = root;
1212 spin_unlock_irqrestore(&iommu->lock, flags);
1217 static void iommu_set_root_entry(struct intel_iommu *iommu)
1223 addr = virt_to_phys(iommu->root_entry);
1224 if (sm_supported(iommu))
1225 addr |= DMA_RTADDR_SMT;
1227 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1228 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1230 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1232 /* Make sure hardware complete it */
1233 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1234 readl, (sts & DMA_GSTS_RTPS), sts);
1236 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1244 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1247 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250 /* Make sure hardware complete it */
1251 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252 readl, (!(val & DMA_GSTS_WBFS)), val);
1254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259 u16 did, u16 source_id, u8 function_mask,
1266 case DMA_CCMD_GLOBAL_INVL:
1267 val = DMA_CCMD_GLOBAL_INVL;
1269 case DMA_CCMD_DOMAIN_INVL:
1270 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272 case DMA_CCMD_DEVICE_INVL:
1273 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1279 val |= DMA_CCMD_ICC;
1281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1284 /* Make sure hardware complete it */
1285 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1286 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1288 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1291 /* return value determines if we need a write buffer flush */
1292 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1293 u64 addr, unsigned int size_order, u64 type)
1295 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1296 u64 val = 0, val_iva = 0;
1300 case DMA_TLB_GLOBAL_FLUSH:
1301 /* global flush doesn't need set IVA_REG */
1302 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1304 case DMA_TLB_DSI_FLUSH:
1305 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1307 case DMA_TLB_PSI_FLUSH:
1308 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309 /* IH bit is passed in as part of address */
1310 val_iva = size_order | addr;
1315 /* Note: set drain read/write */
1318 * This is probably to be super secure.. Looks like we can
1319 * ignore it without any impact.
1321 if (cap_read_drain(iommu->cap))
1322 val |= DMA_TLB_READ_DRAIN;
1324 if (cap_write_drain(iommu->cap))
1325 val |= DMA_TLB_WRITE_DRAIN;
1327 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328 /* Note: Only uses first TLB reg currently */
1330 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1331 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1333 /* Make sure hardware complete it */
1334 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1335 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1337 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339 /* check IOTLB invalidation granularity */
1340 if (DMA_TLB_IAIG(val) == 0)
1341 pr_err("Flush IOTLB failed\n");
1342 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1343 pr_debug("TLB flush request %Lx, actual %Lx\n",
1344 (unsigned long long)DMA_TLB_IIRG(type),
1345 (unsigned long long)DMA_TLB_IAIG(val));
1348 static struct device_domain_info *
1349 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1352 struct device_domain_info *info;
1354 assert_spin_locked(&device_domain_lock);
1359 list_for_each_entry(info, &domain->devices, link)
1360 if (info->iommu == iommu && info->bus == bus &&
1361 info->devfn == devfn) {
1362 if (info->ats_supported && info->dev)
1370 static void domain_update_iotlb(struct dmar_domain *domain)
1372 struct device_domain_info *info;
1373 bool has_iotlb_device = false;
1375 assert_spin_locked(&device_domain_lock);
1377 list_for_each_entry(info, &domain->devices, link) {
1378 struct pci_dev *pdev;
1380 if (!info->dev || !dev_is_pci(info->dev))
1383 pdev = to_pci_dev(info->dev);
1384 if (pdev->ats_enabled) {
1385 has_iotlb_device = true;
1390 domain->has_iotlb_device = has_iotlb_device;
1393 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1395 struct pci_dev *pdev;
1397 assert_spin_locked(&device_domain_lock);
1399 if (!info || !dev_is_pci(info->dev))
1402 pdev = to_pci_dev(info->dev);
1403 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1404 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1405 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1406 * reserved, which should be set to 0.
1408 if (!ecap_dit(info->iommu->ecap))
1411 struct pci_dev *pf_pdev;
1413 /* pdev will be returned if device is not a vf */
1414 pf_pdev = pci_physfn(pdev);
1415 info->pfsid = pci_dev_id(pf_pdev);
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419 /* The PCIe spec, in its wisdom, declares that the behaviour of
1420 the device if you enable PASID support after ATS support is
1421 undefined. So always enable PASID support on devices which
1422 have it, even if we can't yet know if we're ever going to
1424 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1425 info->pasid_enabled = 1;
1427 if (info->pri_supported &&
1428 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1429 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1430 info->pri_enabled = 1;
1432 if (!pdev->untrusted && info->ats_supported &&
1433 pci_ats_page_aligned(pdev) &&
1434 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1435 info->ats_enabled = 1;
1436 domain_update_iotlb(info->domain);
1437 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1443 struct pci_dev *pdev;
1445 assert_spin_locked(&device_domain_lock);
1447 if (!dev_is_pci(info->dev))
1450 pdev = to_pci_dev(info->dev);
1452 if (info->ats_enabled) {
1453 pci_disable_ats(pdev);
1454 info->ats_enabled = 0;
1455 domain_update_iotlb(info->domain);
1457 #ifdef CONFIG_INTEL_IOMMU_SVM
1458 if (info->pri_enabled) {
1459 pci_disable_pri(pdev);
1460 info->pri_enabled = 0;
1462 if (info->pasid_enabled) {
1463 pci_disable_pasid(pdev);
1464 info->pasid_enabled = 0;
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470 u64 addr, unsigned mask)
1473 unsigned long flags;
1474 struct device_domain_info *info;
1476 if (!domain->has_iotlb_device)
1479 spin_lock_irqsave(&device_domain_lock, flags);
1480 list_for_each_entry(info, &domain->devices, link) {
1481 if (!info->ats_enabled)
1484 sid = info->bus << 8 | info->devfn;
1485 qdep = info->ats_qdep;
1486 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1489 spin_unlock_irqrestore(&device_domain_lock, flags);
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493 struct dmar_domain *domain,
1494 unsigned long pfn, unsigned int pages,
1497 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1498 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1499 u16 did = domain->iommu_did[iommu->seq_id];
1506 * Fallback to domain selective flush if no PSI support or the size is too big.
1508 * PSI requires page size to be 2 ^ x, and the base address is naturally
1509 * aligned to the size
1511 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1512 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1515 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1519 * In caching mode, changes of pages from non-present to present require
1520 * flush. However, device IOTLB doesn't need to be flushed in this case.
1522 if (!cap_caching_mode(iommu->cap) || !map)
1523 iommu_flush_dev_iotlb(domain, addr, mask);
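/*
 * Example (illustrative): flushing pfn 0x1000 with pages = 3 gives
 * mask = ilog2(roundup_pow_of_two(3)) = 2, i.e. a PSI covering 4
 * pages; if mask exceeded cap_max_amask_val() the code above would
 * have fallen back to a domain-selective flush instead.
 */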
1526 /* Notification for newly created mappings */
1527 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1528 struct dmar_domain *domain,
1529 unsigned long pfn, unsigned int pages)
1531 /* It's a non-present to present mapping. Only flush if caching mode */
1532 if (cap_caching_mode(iommu->cap))
1533 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1535 iommu_flush_write_buffer(iommu);
1538 static void iommu_flush_iova(struct iova_domain *iovad)
1540 struct dmar_domain *domain;
1543 domain = container_of(iovad, struct dmar_domain, iovad);
1545 for_each_domain_iommu(idx, domain) {
1546 struct intel_iommu *iommu = g_iommus[idx];
1547 u16 did = domain->iommu_did[iommu->seq_id];
1549 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1551 if (!cap_caching_mode(iommu->cap))
1552 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1553 0, MAX_AGAW_PFN_WIDTH);
1557 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1560 unsigned long flags;
1562 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1565 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1567 pmen &= ~DMA_PMEN_EPM;
1568 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1570 /* wait for the protected region status bit to clear */
1571 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1572 readl, !(pmen & DMA_PMEN_PRS), pmen);
1574 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1577 static void iommu_enable_translation(struct intel_iommu *iommu)
1580 unsigned long flags;
1582 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1583 iommu->gcmd |= DMA_GCMD_TE;
1584 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1586 /* Make sure hardware complete it */
1587 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1588 readl, (sts & DMA_GSTS_TES), sts);
1590 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1593 static void iommu_disable_translation(struct intel_iommu *iommu)
1598 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1599 iommu->gcmd &= ~DMA_GCMD_TE;
1600 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1602 /* Make sure hardware complete it */
1603 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1604 readl, (!(sts & DMA_GSTS_TES)), sts);
1606 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1609 static int iommu_init_domains(struct intel_iommu *iommu)
1611 u32 ndomains, nlongs;
1614 ndomains = cap_ndoms(iommu->cap);
1615 pr_debug("%s: Number of Domains supported <%d>\n",
1616 iommu->name, ndomains);
1617 nlongs = BITS_TO_LONGS(ndomains);
1619 spin_lock_init(&iommu->lock);
1621 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1622 if (!iommu->domain_ids) {
1623 pr_err("%s: Allocating domain id array failed\n",
1628 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1629 iommu->domains = kzalloc(size, GFP_KERNEL);
1631 if (iommu->domains) {
1632 size = 256 * sizeof(struct dmar_domain *);
1633 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1636 if (!iommu->domains || !iommu->domains[0]) {
1637 pr_err("%s: Allocating domain array failed\n",
1639 kfree(iommu->domain_ids);
1640 kfree(iommu->domains);
1641 iommu->domain_ids = NULL;
1642 iommu->domains = NULL;
1647 * If Caching mode is set, then invalid translations are tagged
1648 * with domain-id 0, hence we need to pre-allocate it. We also
1649 * use domain-id 0 as a marker for non-allocated domain-id, so
1650 * make sure it is not used for a real domain.
1652 set_bit(0, iommu->domain_ids);
1655 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1656 * entry for first-level or pass-through translation modes should
1657 * be programmed with a domain id different from those used for
1658 * second-level or nested translation. We reserve a domain id for
1661 if (sm_supported(iommu))
1662 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1669 struct device_domain_info *info, *tmp;
1670 unsigned long flags;
1672 if (!iommu->domains || !iommu->domain_ids)
1675 spin_lock_irqsave(&device_domain_lock, flags);
1676 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1677 if (info->iommu != iommu)
1680 if (!info->dev || !info->domain)
1683 __dmar_remove_one_dev_info(info);
1685 spin_unlock_irqrestore(&device_domain_lock, flags);
1687 if (iommu->gcmd & DMA_GCMD_TE)
1688 iommu_disable_translation(iommu);
1691 static void free_dmar_iommu(struct intel_iommu *iommu)
1693 if ((iommu->domains) && (iommu->domain_ids)) {
1694 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1697 for (i = 0; i < elems; i++)
1698 kfree(iommu->domains[i]);
1699 kfree(iommu->domains);
1700 kfree(iommu->domain_ids);
1701 iommu->domains = NULL;
1702 iommu->domain_ids = NULL;
1705 g_iommus[iommu->seq_id] = NULL;
1707 /* free context mapping */
1708 free_context_table(iommu);
1710 #ifdef CONFIG_INTEL_IOMMU_SVM
1711 if (pasid_supported(iommu)) {
1712 if (ecap_prs(iommu->ecap))
1713 intel_svm_finish_prq(iommu);
1718 static struct dmar_domain *alloc_domain(int flags)
1720 struct dmar_domain *domain;
1722 domain = alloc_domain_mem();
1726 memset(domain, 0, sizeof(*domain));
1727 domain->nid = NUMA_NO_NODE;
1728 domain->flags = flags;
1729 domain->has_iotlb_device = false;
1730 INIT_LIST_HEAD(&domain->devices);
1735 /* Must be called with iommu->lock */
1736 static int domain_attach_iommu(struct dmar_domain *domain,
1737 struct intel_iommu *iommu)
1739 unsigned long ndomains;
1742 assert_spin_locked(&device_domain_lock);
1743 assert_spin_locked(&iommu->lock);
1745 domain->iommu_refcnt[iommu->seq_id] += 1;
1746 domain->iommu_count += 1;
1747 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1748 ndomains = cap_ndoms(iommu->cap);
1749 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1751 if (num >= ndomains) {
1752 pr_err("%s: No free domain ids\n", iommu->name);
1753 domain->iommu_refcnt[iommu->seq_id] -= 1;
1754 domain->iommu_count -= 1;
1758 set_bit(num, iommu->domain_ids);
1759 set_iommu_domain(iommu, num, domain);
1761 domain->iommu_did[iommu->seq_id] = num;
1762 domain->nid = iommu->node;
1764 domain_update_iommu_cap(domain);
1770 static int domain_detach_iommu(struct dmar_domain *domain,
1771 struct intel_iommu *iommu)
1775 assert_spin_locked(&device_domain_lock);
1776 assert_spin_locked(&iommu->lock);
1778 domain->iommu_refcnt[iommu->seq_id] -= 1;
1779 count = --domain->iommu_count;
1780 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1781 num = domain->iommu_did[iommu->seq_id];
1782 clear_bit(num, iommu->domain_ids);
1783 set_iommu_domain(iommu, num, NULL);
1785 domain_update_iommu_cap(domain);
1786 domain->iommu_did[iommu->seq_id] = 0;
1792 static struct iova_domain reserved_iova_list;
1793 static struct lock_class_key reserved_rbtree_key;
1795 static int dmar_init_reserved_ranges(void)
1797 struct pci_dev *pdev = NULL;
1801 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1803 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1804 &reserved_rbtree_key);
1806 /* IOAPIC ranges shouldn't be accessed by DMA */
1807 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1808 IOVA_PFN(IOAPIC_RANGE_END));
1810 pr_err("Reserve IOAPIC range failed\n");
1814 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1815 for_each_pci_dev(pdev) {
1818 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1819 r = &pdev->resource[i];
1820 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1822 iova = reserve_iova(&reserved_iova_list,
1826 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1834 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1836 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1839 static inline int guestwidth_to_adjustwidth(int gaw)
1842 int r = (gaw - 12) % 9;
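/*
 * Worked example (illustrative): the adjusted width rounds the guest
 * width up to the next value expressible as 12 + 9 * agaw, so a
 * 48-bit guest width stays 48 (r == 0) while e.g. a 50-bit guest
 * width would be rounded up to 57.
 */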
1853 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1856 int adjust_width, agaw, cap_width;
1857 unsigned long sagaw;
1860 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1862 err = init_iova_flush_queue(&domain->iovad,
1863 iommu_flush_iova, iova_entry_free);
1867 domain_reserve_special_ranges(domain);
1869 /* calculate AGAW */
1870 cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw));
1871 if (guest_width > cap_width)
1872 guest_width = cap_width;
1873 domain->gaw = guest_width;
1874 adjust_width = guestwidth_to_adjustwidth(guest_width);
1875 agaw = width_to_agaw(adjust_width);
1876 sagaw = cap_sagaw(iommu->cap);
1877 if (!test_bit(agaw, &sagaw)) {
1878 /* hardware doesn't support it, choose a bigger one */
1879 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1880 agaw = find_next_bit(&sagaw, 5, agaw);
1884 domain->agaw = agaw;
1886 if (ecap_coherent(iommu->ecap))
1887 domain->iommu_coherency = 1;
1889 domain->iommu_coherency = 0;
1891 if (ecap_sc_support(iommu->ecap))
1892 domain->iommu_snooping = 1;
1894 domain->iommu_snooping = 0;
1896 if (intel_iommu_superpage)
1897 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1899 domain->iommu_superpage = 0;
1901 domain->nid = iommu->node;
1903 /* always allocate the top pgd */
1904 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1907 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1911 static void domain_exit(struct dmar_domain *domain)
1914 /* Remove associated devices and clear attached or cached domains */
1915 domain_remove_dev_info(domain);
1918 put_iova_domain(&domain->iovad);
1921 struct page *freelist;
1923 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1924 dma_free_pagelist(freelist);
1927 free_domain_mem(domain);
1931 * Get the PASID directory size for scalable mode context entry.
1932 * Value of X in the PDTS field of a scalable mode context entry
1933 * indicates PASID directory with 2^(X + 7) entries.
1935 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1939 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1940 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1948 * Set the RID_PASID field of a scalable mode context entry. The
1949 * IOMMU hardware will use the PASID value set in this field for
1950 * DMA translations of DMA requests without PASID.
1953 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1955 context->hi |= pasid & ((1 << 20) - 1);
1956 context->hi |= (1 << 20);
1960 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1963 static inline void context_set_sm_dte(struct context_entry *context)
1965 context->lo |= (1 << 2);
1969 * Set the PRE(Page Request Enable) field of a scalable mode context
1972 static inline void context_set_sm_pre(struct context_entry *context)
1974 context->lo |= (1 << 4);
1977 /* Convert value to context PASID directory size field coding. */
1978 #define context_pdts(pds) (((pds) & 0x7) << 9)
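/*
 * Worked example (illustrative, assuming 64 PASID-table entries per
 * directory entry): a 20-bit PASID space needs a 2^14-entry PASID
 * directory; since a PDTS value of X means 2^(X + 7) directory
 * entries, PDTS 0 is a 128-entry directory and PDTS 7 covers the full
 * 2^14 entries.
 */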
1980 static int domain_context_mapping_one(struct dmar_domain *domain,
1981 struct intel_iommu *iommu,
1982 struct pasid_table *table,
1985 u16 did = domain->iommu_did[iommu->seq_id];
1986 int translation = CONTEXT_TT_MULTI_LEVEL;
1987 struct device_domain_info *info = NULL;
1988 struct context_entry *context;
1989 unsigned long flags;
1994 if (hw_pass_through && domain_type_is_si(domain))
1995 translation = CONTEXT_TT_PASS_THROUGH;
1997 pr_debug("Set context mapping for %02x:%02x.%d\n",
1998 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2000 BUG_ON(!domain->pgd);
2002 spin_lock_irqsave(&device_domain_lock, flags);
2003 spin_lock(&iommu->lock);
2006 context = iommu_context_addr(iommu, bus, devfn, 1);
2011 if (context_present(context))
2015 * For kdump cases, old valid entries may be cached due to the
2016 * in-flight DMA and copied pgtable, but there is no unmapping
2017 * behaviour for them, thus we need an explicit cache flush for
2018 * the newly-mapped device. For kdump, at this point, the device
2019 * is supposed to finish reset at its driver probe stage, so no
2020 * in-flight DMA will exist, and we don't need to worry anymore
2023 if (context_copied(context)) {
2024 u16 did_old = context_domain_id(context);
2026 if (did_old < cap_ndoms(iommu->cap)) {
2027 iommu->flush.flush_context(iommu, did_old,
2028 (((u16)bus) << 8) | devfn,
2029 DMA_CCMD_MASK_NOBIT,
2030 DMA_CCMD_DEVICE_INVL);
2031 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2036 context_clear_entry(context);
2038 if (sm_supported(iommu)) {
2043 /* Setup the PASID DIR pointer: */
2044 pds = context_get_sm_pds(table);
2045 context->lo = (u64)virt_to_phys(table->table) |
2048 /* Setup the RID_PASID field: */
2049 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2052 * Setup the Device-TLB enable bit and Page request
2055 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2056 if (info && info->ats_supported)
2057 context_set_sm_dte(context);
2058 if (info && info->pri_supported)
2059 context_set_sm_pre(context);
2061 struct dma_pte *pgd = domain->pgd;
2064 context_set_domain_id(context, did);
2066 if (translation != CONTEXT_TT_PASS_THROUGH) {
2068 * Skip top levels of page tables for iommu which has
2069 * less agaw than default. Unnecessary for PT mode.
2071 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2073 pgd = phys_to_virt(dma_pte_addr(pgd));
2074 if (!dma_pte_present(pgd))
2078 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2079 if (info && info->ats_supported)
2080 translation = CONTEXT_TT_DEV_IOTLB;
2082 translation = CONTEXT_TT_MULTI_LEVEL;
2084 context_set_address_root(context, virt_to_phys(pgd));
2085 context_set_address_width(context, agaw);
2088 * In pass through mode, AW must be programmed to
2089 * indicate the largest AGAW value supported by
2090 * hardware. And ASR is ignored by hardware.
2092 context_set_address_width(context, iommu->msagaw);
2095 context_set_translation_type(context, translation);
2098 context_set_fault_enable(context);
2099 context_set_present(context);
2100 if (!ecap_coherent(iommu->ecap))
2101 clflush_cache_range(context, sizeof(*context));
2104 * It's a non-present to present mapping. If hardware doesn't cache
2105 * non-present entries we only need to flush the write-buffer. If it
2106 * _does_ cache non-present entries, then it does so in the special
2107 * domain #0, which we have to flush:
2109 if (cap_caching_mode(iommu->cap)) {
2110 iommu->flush.flush_context(iommu, 0,
2111 (((u16)bus) << 8) | devfn,
2112 DMA_CCMD_MASK_NOBIT,
2113 DMA_CCMD_DEVICE_INVL);
2114 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2116 iommu_flush_write_buffer(iommu);
2118 iommu_enable_dev_iotlb(info);
2123 spin_unlock(&iommu->lock);
2124 spin_unlock_irqrestore(&device_domain_lock, flags);
2129 struct domain_context_mapping_data {
2130 struct dmar_domain *domain;
2131 struct intel_iommu *iommu;
2132 struct pasid_table *table;
2135 static int domain_context_mapping_cb(struct pci_dev *pdev,
2136 u16 alias, void *opaque)
2138 struct domain_context_mapping_data *data = opaque;
2140 return domain_context_mapping_one(data->domain, data->iommu,
2141 data->table, PCI_BUS_NUM(alias),
2146 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2148 struct domain_context_mapping_data data;
2149 struct pasid_table *table;
2150 struct intel_iommu *iommu;
2153 iommu = device_to_iommu(dev, &bus, &devfn);
2157 table = intel_pasid_get_table(dev);
2159 if (!dev_is_pci(dev))
2160 return domain_context_mapping_one(domain, iommu, table,
2163 data.domain = domain;
2167 return pci_for_each_dma_alias(to_pci_dev(dev),
2168 &domain_context_mapping_cb, &data);
2171 static int domain_context_mapped_cb(struct pci_dev *pdev,
2172 u16 alias, void *opaque)
2174 struct intel_iommu *iommu = opaque;
2176 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2179 static int domain_context_mapped(struct device *dev)
2181 struct intel_iommu *iommu;
2184 iommu = device_to_iommu(dev, &bus, &devfn);
2188 if (!dev_is_pci(dev))
2189 return device_context_mapped(iommu, bus, devfn);
2191 return !pci_for_each_dma_alias(to_pci_dev(dev),
2192 domain_context_mapped_cb, iommu);
2195 /* Returns a number of VTD pages, but aligned to MM page size */
2196 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2199 host_addr &= ~PAGE_MASK;
2200 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
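/*
 * Worked example (illustrative, 4KiB pages): a 2-byte buffer starting
 * at page offset 0xfff straddles a page boundary, so
 * aligned_nrpages(0xfff, 2) = PAGE_ALIGN(0x1001) >> VTD_PAGE_SHIFT = 2.
 */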
2203 /* Return largest possible superpage level for a given mapping */
2204 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2205 unsigned long iov_pfn,
2206 unsigned long phy_pfn,
2207 unsigned long pages)
2209 int support, level = 1;
2210 unsigned long pfnmerge;
2212 support = domain->iommu_superpage;
2214 /* To use a large page, the virtual *and* physical addresses
2215 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2216 of them will mean we have to use smaller pages. So just
2217 merge them and check both at once. */
2218 pfnmerge = iov_pfn | phy_pfn;
2220 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2221 pages >>= VTD_STRIDE_SHIFT;
2224 pfnmerge >>= VTD_STRIDE_SHIFT;
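/*
 * Example (illustrative): if both iov_pfn and phy_pfn have their low 9
 * bits clear (2MiB aligned) and at least 512 pages are being mapped,
 * the loop above returns level 2 when the hardware advertises 2MiB
 * superpage support, otherwise it falls back to level 1 (4KiB PTEs).
 */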
2231 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2232 struct scatterlist *sg, unsigned long phys_pfn,
2233 unsigned long nr_pages, int prot)
2235 struct dma_pte *first_pte = NULL, *pte = NULL;
2236 phys_addr_t uninitialized_var(pteval);
2237 unsigned long sg_res = 0;
2238 unsigned int largepage_lvl = 0;
2239 unsigned long lvl_pages = 0;
2241 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2243 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2246 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2250 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2253 while (nr_pages > 0) {
2257 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2259 sg_res = aligned_nrpages(sg->offset, sg->length);
2260 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2261 sg->dma_length = sg->length;
2262 pteval = (sg_phys(sg) - pgoff) | prot;
2263 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2267 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2269 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2272 /* It is a large page */
2273 if (largepage_lvl > 1) {
2274 unsigned long nr_superpages, end_pfn;
2276 pteval |= DMA_PTE_LARGE_PAGE;
2277 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2279 nr_superpages = sg_res / lvl_pages;
2280 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2283 * Ensure that old small page tables are
2284 * removed to make room for superpage(s).
2285 * We're adding new large pages, so make sure
2286 * we don't remove their parent tables.
2288 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2291 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2295 /* We don't need lock here, nobody else
2296 * touches the iova range
2298 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2300 static int dumps = 5;
2301 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2302 iov_pfn, tmp, (unsigned long long)pteval);
2305 debug_dma_dump_mappings(NULL);
2310 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2312 BUG_ON(nr_pages < lvl_pages);
2313 BUG_ON(sg_res < lvl_pages);
2315 nr_pages -= lvl_pages;
2316 iov_pfn += lvl_pages;
2317 phys_pfn += lvl_pages;
2318 pteval += lvl_pages * VTD_PAGE_SIZE;
2319 sg_res -= lvl_pages;
2321 /* If the next PTE would be the first in a new page, then we
2322 need to flush the cache on the entries we've just written.
2323 And then we'll need to recalculate 'pte', so clear it and
2324 let it get set again in the if (!pte) block above.
2326 If we're done (!nr_pages) we need to flush the cache too.
2328 Also if we've been setting superpages, we may need to
2329 recalculate 'pte' and switch back to smaller pages for the
2330 end of the mapping, if the trailing size is not enough to
2331 use another superpage (i.e. sg_res < lvl_pages). */
2333 if (!nr_pages || first_pte_in_page(pte) ||
2334 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2335 domain_flush_cache(domain, first_pte,
2336 (void *)pte - (void *)first_pte);
2340 if (!sg_res && nr_pages)
2346 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2347 struct scatterlist *sg, unsigned long phys_pfn,
2348 unsigned long nr_pages, int prot)
2351 struct intel_iommu *iommu;
2353 /* Do the real mapping first */
2354 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2358 for_each_domain_iommu(iommu_id, domain) {
2359 iommu = g_iommus[iommu_id];
2360 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2366 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2367 struct scatterlist *sg, unsigned long nr_pages,
2370 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2373 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2374 unsigned long phys_pfn, unsigned long nr_pages,
2377 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2380 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2382 unsigned long flags;
2383 struct context_entry *context;
2389 spin_lock_irqsave(&iommu->lock, flags);
2390 context = iommu_context_addr(iommu, bus, devfn, 0);
2392 spin_unlock_irqrestore(&iommu->lock, flags);
2395 did_old = context_domain_id(context);
2396 context_clear_entry(context);
2397 __iommu_flush_cache(iommu, context, sizeof(*context));
2398 spin_unlock_irqrestore(&iommu->lock, flags);
2399 iommu->flush.flush_context(iommu,
2401 (((u16)bus) << 8) | devfn,
2402 DMA_CCMD_MASK_NOBIT,
2403 DMA_CCMD_DEVICE_INVL);
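/* The context-cache entry is invalidated above; the IOTLB entries tagged with
 * the old domain id are flushed next so no stale translations survive. */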
2404 iommu->flush.flush_iotlb(iommu,
2411 static inline void unlink_domain_info(struct device_domain_info *info)
2413 assert_spin_locked(&device_domain_lock);
2414 list_del(&info->link);
2415 list_del(&info->global);
2417 info->dev->archdata.iommu = NULL;
2420 static void domain_remove_dev_info(struct dmar_domain *domain)
2422 struct device_domain_info *info, *tmp;
2423 unsigned long flags;
2425 spin_lock_irqsave(&device_domain_lock, flags);
2426 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2427 __dmar_remove_one_dev_info(info);
2428 spin_unlock_irqrestore(&device_domain_lock, flags);
2433 /* Note: struct device->archdata.iommu is used to store the device_domain_info */
2435 static struct dmar_domain *find_domain(struct device *dev)
2437 struct device_domain_info *info;
2439 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2440 struct iommu_domain *domain;
2442 dev->archdata.iommu = NULL;
2443 domain = iommu_get_domain_for_dev(dev);
2445 intel_iommu_attach_device(domain, dev);
2448 /* No lock here, assumes no domain exit in normal case */
2449 info = dev->archdata.iommu;
2452 return info->domain;
2456 static inline struct device_domain_info *
2457 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2459 struct device_domain_info *info;
2461 list_for_each_entry(info, &device_domain_list, global)
2462 if (info->iommu->segment == segment && info->bus == bus &&
2463 info->devfn == devfn)
2469 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2472 struct dmar_domain *domain)
2474 struct dmar_domain *found = NULL;
2475 struct device_domain_info *info;
2476 unsigned long flags;
2479 info = alloc_devinfo_mem();
2484 info->devfn = devfn;
2485 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2486 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2489 info->domain = domain;
2490 info->iommu = iommu;
2491 info->pasid_table = NULL;
2492 info->auxd_enabled = 0;
2493 INIT_LIST_HEAD(&info->auxiliary_domains);
2495 if (dev && dev_is_pci(dev)) {
2496 struct pci_dev *pdev = to_pci_dev(info->dev);
2498 if (!pdev->untrusted &&
2499 !pci_ats_disabled() &&
2500 ecap_dev_iotlb_support(iommu->ecap) &&
2501 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2502 dmar_find_matched_atsr_unit(pdev))
2503 info->ats_supported = 1;
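/* ATS is only advertised for trusted devices that have device-IOTLB support
 * and a matching ATSR unit for their root port. */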
2505 if (sm_supported(iommu)) {
2506 if (pasid_supported(iommu)) {
2507 int features = pci_pasid_features(pdev);
2509 info->pasid_supported = features | 1;
2512 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2513 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2514 info->pri_supported = 1;
2518 spin_lock_irqsave(&device_domain_lock, flags);
2520 found = find_domain(dev);
2523 struct device_domain_info *info2;
2524 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2526 found = info2->domain;
2532 spin_unlock_irqrestore(&device_domain_lock, flags);
2533 free_devinfo_mem(info);
2534 /* Caller must free the original domain */
2538 spin_lock(&iommu->lock);
2539 ret = domain_attach_iommu(domain, iommu);
2540 spin_unlock(&iommu->lock);
2543 spin_unlock_irqrestore(&device_domain_lock, flags);
2544 free_devinfo_mem(info);
2548 list_add(&info->link, &domain->devices);
2549 list_add(&info->global, &device_domain_list);
2551 dev->archdata.iommu = info;
2552 spin_unlock_irqrestore(&device_domain_lock, flags);
2554 /* PASID table is mandatory for a PCI device in scalable mode. */
2555 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2556 ret = intel_pasid_alloc_table(dev);
2558 dev_err(dev, "PASID table allocation failed\n");
2559 dmar_remove_one_dev_info(dev);
2563 /* Setup the PASID entry for requests without PASID: */
2564 spin_lock_irqsave(&iommu->lock, flags);
2565 if (hw_pass_through && domain_type_is_si(domain))
2566 ret = intel_pasid_setup_pass_through(iommu, domain,
2567 dev, PASID_RID2PASID);
2569 ret = intel_pasid_setup_second_level(iommu, domain,
2570 dev, PASID_RID2PASID);
2571 spin_unlock_irqrestore(&iommu->lock, flags);
2573 dev_err(dev, "Setup RID2PASID failed\n");
2574 dmar_remove_one_dev_info(dev);
2579 if (dev && domain_context_mapping(domain, dev)) {
2580 dev_err(dev, "Domain context map failed\n");
2581 dmar_remove_one_dev_info(dev);
2588 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2590 *(u16 *)opaque = alias;
2594 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2596 struct device_domain_info *info;
2597 struct dmar_domain *domain = NULL;
2598 struct intel_iommu *iommu;
2600 unsigned long flags;
2603 iommu = device_to_iommu(dev, &bus, &devfn);
2607 if (dev_is_pci(dev)) {
2608 struct pci_dev *pdev = to_pci_dev(dev);
2610 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2612 spin_lock_irqsave(&device_domain_lock, flags);
2613 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2614 PCI_BUS_NUM(dma_alias),
2617 iommu = info->iommu;
2618 domain = info->domain;
2620 spin_unlock_irqrestore(&device_domain_lock, flags);
2622 /* DMA alias already has a domain, use it */
2627 /* Allocate and initialize new domain for the device */
2628 domain = alloc_domain(0);
2631 if (domain_init(domain, iommu, gaw)) {
2632 domain_exit(domain);
2640 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2641 struct dmar_domain *domain)
2643 struct intel_iommu *iommu;
2644 struct dmar_domain *tmp;
2645 u16 req_id, dma_alias;
2648 iommu = device_to_iommu(dev, &bus, &devfn);
2652 req_id = ((u16)bus << 8) | devfn;
2654 if (dev_is_pci(dev)) {
2655 struct pci_dev *pdev = to_pci_dev(dev);
2657 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2659 /* register PCI DMA alias device */
2660 if (req_id != dma_alias) {
2661 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2662 dma_alias & 0xff, NULL, domain);
2664 if (!tmp || tmp != domain)
2669 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2670 if (!tmp || tmp != domain)
2676 static int iommu_domain_identity_map(struct dmar_domain *domain,
2677 unsigned long long start,
2678 unsigned long long end)
2680 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2681 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2683 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2684 dma_to_mm_pfn(last_vpfn))) {
2685 pr_err("Reserving iova failed\n");
2689 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2691 /* The RMRR range might overlap the physical memory range; clear it first. */
2694 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
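/* Map the range 1:1 — the IOVA pfn doubles as the physical pfn below. */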
2696 return __domain_mapping(domain, first_vpfn, NULL,
2697 first_vpfn, last_vpfn - first_vpfn + 1,
2698 DMA_PTE_READ|DMA_PTE_WRITE);
2701 static int domain_prepare_identity_map(struct device *dev,
2702 struct dmar_domain *domain,
2703 unsigned long long start,
2704 unsigned long long end)
2706 /* For _hardware_ passthrough, don't bother. But for software
2707 passthrough, we do it anyway -- it may indicate a memory
2708 range which is reserved in E820, and so didn't get set
2709 up to start with in si_domain */
2710 if (domain == si_domain && hw_pass_through) {
2711 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2716 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2719 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2720 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2721 dmi_get_system_info(DMI_BIOS_VENDOR),
2722 dmi_get_system_info(DMI_BIOS_VERSION),
2723 dmi_get_system_info(DMI_PRODUCT_VERSION));
2727 if (end >> agaw_to_width(domain->agaw)) {
2728 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2729 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2730 agaw_to_width(domain->agaw),
2731 dmi_get_system_info(DMI_BIOS_VENDOR),
2732 dmi_get_system_info(DMI_BIOS_VERSION),
2733 dmi_get_system_info(DMI_PRODUCT_VERSION));
2737 return iommu_domain_identity_map(domain, start, end);
2740 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2742 static int __init si_domain_init(int hw)
2744 struct dmar_rmrr_unit *rmrr;
2748 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2752 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2753 domain_exit(si_domain);
2761 for_each_online_node(nid) {
2762 unsigned long start_pfn, end_pfn;
2765 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2766 ret = iommu_domain_identity_map(si_domain,
2767 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2774 /* Identity map the RMRRs so that devices with RMRRs can also use the si_domain. */
2777 for_each_rmrr_units(rmrr) {
2778 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2780 unsigned long long start = rmrr->base_address;
2781 unsigned long long end = rmrr->end_address;
2783 if (WARN_ON(end < start ||
2784 end >> agaw_to_width(si_domain->agaw)))
2787 ret = iommu_domain_identity_map(si_domain, start, end);
2796 static int identity_mapping(struct device *dev)
2798 struct device_domain_info *info;
2800 info = dev->archdata.iommu;
2801 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2802 return (info->domain == si_domain);
2807 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2809 struct dmar_domain *ndomain;
2810 struct intel_iommu *iommu;
2813 iommu = device_to_iommu(dev, &bus, &devfn);
2817 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2818 if (ndomain != domain)
2824 static bool device_has_rmrr(struct device *dev)
2826 struct dmar_rmrr_unit *rmrr;
2831 for_each_rmrr_units(rmrr) {
2833 /* Return TRUE if this RMRR contains the device that is passed in. */
2836 for_each_active_dev_scope(rmrr->devices,
2837 rmrr->devices_cnt, i, tmp)
2839 is_downstream_to_pci_bridge(dev, tmp)) {
2849 /** device_rmrr_is_relaxable - Test whether the RMRR of this device
2850 * is relaxable (ie. is allowed to be not enforced under some conditions)
2851 * @dev: device handle
2853 * We assume that PCI USB devices with RMRRs have them largely
2854 * for historical reasons and that the RMRR space is not actively used post
2855 * boot. This exclusion may change if vendors begin to abuse it.
2857 * The same exception is made for graphics devices, with the requirement that
2858 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2861 * Return: true if the RMRR is relaxable, false otherwise */
2863 static bool device_rmrr_is_relaxable(struct device *dev)
2865 struct pci_dev *pdev;
2867 if (!dev_is_pci(dev))
2870 pdev = to_pci_dev(dev);
2871 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2878 /* There are a couple cases where we need to restrict the functionality of
2879 * devices associated with RMRRs. The first is when evaluating a device for
2880 * identity mapping because problems exist when devices are moved in and out
2881 * of domains and their respective RMRR information is lost. This means that
2882 * a device with associated RMRRs will never be in a "passthrough" domain.
2883 * The second is use of the device through the IOMMU API. This interface
2884 * expects to have full control of the IOVA space for the device. We cannot
2885 * satisfy both the requirement that RMRR access is maintained and have an
2886 * unencumbered IOVA space. We also have no ability to quiesce the device's
2887 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2888 * We therefore prevent devices associated with an RMRR from participating in
2889 * the IOMMU API, which eliminates them from device assignment.
2891 * In both cases, devices which have relaxable RMRRs are not concerned by this
2892 * restriction. See the device_rmrr_is_relaxable comment. */
2894 static bool device_is_rmrr_locked(struct device *dev)
2896 if (!device_has_rmrr(dev))
2899 if (device_rmrr_is_relaxable(dev))
2906 /* Return the required default domain type for a specific device.
2908 * @dev: the device in query
2912 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2913 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2914 * - 0: both identity and dynamic domains work for this device */
2916 static int device_def_domain_type(struct device *dev)
2918 if (dev_is_pci(dev)) {
2919 struct pci_dev *pdev = to_pci_dev(dev);
2922 * Prevent any device marked as untrusted from getting
2923 * placed into the statically identity mapping domain.
2925 if (pdev->untrusted)
2926 return IOMMU_DOMAIN_DMA;
2928 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2929 return IOMMU_DOMAIN_IDENTITY;
2931 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2932 return IOMMU_DOMAIN_IDENTITY;
2935 /* We want to start off with all devices in the 1:1 domain, and
2936 * take them out later if we find they can't access all of memory.
2938 * However, we can't do this for PCI devices behind bridges,
2939 * because all PCI devices behind the same bridge will end up
2940 * with the same source-id on their transactions.
2942 * Practically speaking, we can't change things around for these
2943 * devices at run-time, because we can't be sure there'll be no
2944 * DMA transactions in flight for any of their siblings.
2946 * So PCI devices (unless they're on the root bus) as well as
2947 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2948 * the 1:1 domain, just in _case_ one of their siblings turns out
2949 * not to be able to map all of memory. */
2951 if (!pci_is_pcie(pdev)) {
2952 if (!pci_is_root_bus(pdev->bus))
2953 return IOMMU_DOMAIN_DMA;
2954 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2955 return IOMMU_DOMAIN_DMA;
2956 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2957 return IOMMU_DOMAIN_DMA;
2960 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2961 IOMMU_DOMAIN_IDENTITY : 0;
2964 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2967 /* Start from a sane iommu hardware state.
2968 * If queued invalidation was already initialized by us
2969 * (for example, while enabling interrupt-remapping) then
2970 * things are already rolling from a sane state. */
2974 /* Clear any previous faults. */
2976 dmar_fault(-1, iommu);
2978 /* Disable queued invalidation if supported and already enabled
2979 * before OS handover. */
2981 dmar_disable_qi(iommu);
2984 if (dmar_enable_qi(iommu)) {
2986 /* Queued invalidation is not enabled; use register-based invalidation */
2988 iommu->flush.flush_context = __iommu_flush_context;
2989 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2990 pr_info("%s: Using Register based invalidation\n",
2993 iommu->flush.flush_context = qi_flush_context;
2994 iommu->flush.flush_iotlb = qi_flush_iotlb;
2995 pr_info("%s: Using Queued invalidation\n", iommu->name);
2999 static int copy_context_table(struct intel_iommu *iommu,
3000 struct root_entry *old_re,
3001 struct context_entry **tbl,
3004 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3005 struct context_entry *new_ce = NULL, ce;
3006 struct context_entry *old_ce = NULL;
3007 struct root_entry re;
3008 phys_addr_t old_ce_phys;
3010 tbl_idx = ext ? bus * 2 : bus;
3011 memcpy(&re, old_re, sizeof(re));
3013 for (devfn = 0; devfn < 256; devfn++) {
3014 /* First calculate the correct index */
3015 idx = (ext ? devfn * 2 : devfn) % 256;
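/* In extended mode each context entry is twice the legacy size, so each bus
 * needs two context-table pages and the devfn index is scaled to match. */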
3018 /* First save what we may have and clean up */
3020 tbl[tbl_idx] = new_ce;
3021 __iommu_flush_cache(iommu, new_ce,
3031 old_ce_phys = root_entry_lctp(&re);
3033 old_ce_phys = root_entry_uctp(&re);
3036 if (ext && devfn == 0) {
3037 /* No LCTP, try UCTP */
3046 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3051 new_ce = alloc_pgtable_page(iommu->node);
3058 /* Now copy the context entry */
3059 memcpy(&ce, old_ce + idx, sizeof(ce));
3061 if (!__context_present(&ce))
3064 did = context_domain_id(&ce);
3065 if (did >= 0 && did < cap_ndoms(iommu->cap))
3066 set_bit(did, iommu->domain_ids);
3069 /* We need a marker for copied context entries. This
3070 * marker needs to work for the old format as well as
3071 * for extended context entries.
3073 * Bit 67 of the context entry is used. In the old
3074 * format this bit is available to software, in the
3075 * extended format it is the PGE bit, but PGE is ignored
3076 * by HW if PASIDs are disabled (and thus still available).
3079 * So disable PASIDs first and then mark the entry
3080 * copied. This means that we don't copy PASID
3081 * translations from the old kernel, but this is fine as
3082 * faults there are not fatal. */
3084 context_clear_pasid_enable(&ce);
3085 context_set_copied(&ce);
3090 tbl[tbl_idx + pos] = new_ce;
3092 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3101 static int copy_translation_tables(struct intel_iommu *iommu)
3103 struct context_entry **ctxt_tbls;
3104 struct root_entry *old_rt;
3105 phys_addr_t old_rt_phys;
3106 int ctxt_table_entries;
3107 unsigned long flags;
3112 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3113 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3114 new_ext = !!ecap_ecs(iommu->ecap);
3117 /* The RTT bit can only be changed when translation is disabled,
3118 * but disabling translation means to open a window for data
3119 * corruption. So bail out and don't copy anything if we would
3120 * have to change the bit. */
3125 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3129 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3133 /* This is too big for the stack - allocate it from slab */
3134 ctxt_table_entries = ext ? 512 : 256;
3136 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3140 for (bus = 0; bus < 256; bus++) {
3141 ret = copy_context_table(iommu, &old_rt[bus],
3142 ctxt_tbls, bus, ext);
3144 pr_err("%s: Failed to copy context table for bus %d\n",
3150 spin_lock_irqsave(&iommu->lock, flags);
3152 /* Context tables are copied, now write them to the root_entry table */
3153 for (bus = 0; bus < 256; bus++) {
3154 int idx = ext ? bus * 2 : bus;
3157 if (ctxt_tbls[idx]) {
3158 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3159 iommu->root_entry[bus].lo = val;
3162 if (!ext || !ctxt_tbls[idx + 1])
3165 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3166 iommu->root_entry[bus].hi = val;
3169 spin_unlock_irqrestore(&iommu->lock, flags);
3173 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3183 static int __init init_dmars(void)
3185 struct dmar_drhd_unit *drhd;
3186 struct intel_iommu *iommu;
3192 /* For each drhd: allocate and program the root entry to not present. */
3195 for_each_drhd_unit(drhd) {
3197 /* No lock needed as this is only incremented in the single-
3198 * threaded kernel __init code path; all other access is read-only. */
3201 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3205 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3208 /* Preallocate enough resources for IOMMU hot-addition */
3209 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3210 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3212 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3215 pr_err("Allocating global iommu array failed\n");
3220 for_each_iommu(iommu, drhd) {
3221 if (drhd->ignored) {
3222 iommu_disable_translation(iommu);
3227 /* Find the max pasid size of all IOMMUs in the system.
3228 * We need to ensure the system pasid table is no bigger
3229 * than the smallest supported size. */
3231 if (pasid_supported(iommu)) {
3232 u32 temp = 2 << ecap_pss(iommu->ecap);
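/* ecap_pss() reports the supported PASID width minus one, so 2 << pss is the
 * number of PASIDs this unit can handle; keep the global maximum at the
 * smallest such value. */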
3234 intel_pasid_max_id = min_t(u32, temp,
3235 intel_pasid_max_id);
3238 g_iommus[iommu->seq_id] = iommu;
3240 intel_iommu_init_qi(iommu);
3242 ret = iommu_init_domains(iommu);
3246 init_translation_status(iommu);
3248 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3249 iommu_disable_translation(iommu);
3250 clear_translation_pre_enabled(iommu);
3251 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3257 /* TBD: we could share the same root & context tables
3258 * among all IOMMUs. Need to split it later. */
3260 ret = iommu_alloc_root_entry(iommu);
3264 if (translation_pre_enabled(iommu)) {
3265 pr_info("Translation already enabled - trying to copy translation structures\n");
3267 ret = copy_translation_tables(iommu);
3270 /* We found the IOMMU with translation
3271 * enabled - but failed to copy over the
3272 * old root-entry table. Try to proceed
3273 * by disabling translation now and
3274 * allocating a clean root-entry table.
3275 * This might cause DMAR faults, but
3276 * probably the dump will still succeed. */
3278 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3280 iommu_disable_translation(iommu);
3281 clear_translation_pre_enabled(iommu);
3283 pr_info("Copied translation tables from previous kernel for %s\n",
3288 if (!ecap_pass_through(iommu->ecap))
3289 hw_pass_through = 0;
3291 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3292 pr_info("Disable batched IOTLB flush due to virtualization\n");
3293 intel_iommu_strict = 1;
3296 #ifdef CONFIG_INTEL_IOMMU_SVM
3297 if (pasid_supported(iommu))
3298 intel_svm_init(iommu);
3303 /* Now that qi is enabled on all iommus, set the root entry and flush
3304 * caches. This is required on some Intel X58 chipsets, otherwise the
3305 * flush_context function will loop forever and the boot hangs. */
3307 for_each_active_iommu(iommu, drhd) {
3308 iommu_flush_write_buffer(iommu);
3309 iommu_set_root_entry(iommu);
3310 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3311 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3314 if (iommu_default_passthrough())
3315 iommu_identity_mapping |= IDENTMAP_ALL;
3317 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322 iommu_identity_mapping |= IDENTMAP_GFX;
3324 check_tylersburg_isoch();
3326 ret = si_domain_init(hw_pass_through);
3333 /* For each drhd: global invalidate context cache,
3334 * global invalidate iotlb,
3335 * enable translation */
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3340 /* We always have to disable PMRs or DMA may fail on this device. */
3344 iommu_disable_protect_mem_regions(iommu);
3348 iommu_flush_write_buffer(iommu);
3350 #ifdef CONFIG_INTEL_IOMMU_SVM
3351 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3353 /* Calling dmar_alloc_hwirq() with dmar_global_lock held
3354 * could cause a lock race condition. */
3356 up_write(&dmar_global_lock);
3357 ret = intel_svm_enable_prq(iommu);
3358 down_write(&dmar_global_lock);
3363 ret = dmar_set_interrupt(iommu);
3371 for_each_active_iommu(iommu, drhd) {
3372 disable_dmar_iommu(iommu);
3373 free_dmar_iommu(iommu);
3376 domain_exit(si_domain);
3386 /* This takes a number of _MM_ pages, not VTD pages */
3387 static unsigned long intel_alloc_iova(struct device *dev,
3388 struct dmar_domain *domain,
3389 unsigned long nrpages, uint64_t dma_mask)
3391 unsigned long iova_pfn;
3393 /* Restrict dma_mask to the width that the iommu can handle */
3394 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3395 /* Ensure we reserve the whole size-aligned region */
3396 nrpages = __roundup_pow_of_two(nrpages);
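/* e.g. a 3-page request is rounded up to 4 pages so the allocator can hand
 * back a size-aligned IOVA region. */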
3398 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3400 /* First try to allocate an io virtual address in
3401 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range */
3404 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3405 IOVA_PFN(DMA_BIT_MASK(32)), false);
3409 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3410 IOVA_PFN(dma_mask), true);
3411 if (unlikely(!iova_pfn)) {
3412 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3420 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3422 struct dmar_domain *domain, *tmp;
3423 struct dmar_rmrr_unit *rmrr;
3424 struct device *i_dev;
3427 /* Device shouldn't be attached to any domain yet. */
3428 domain = find_domain(dev);
3432 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3436 /* We have a new domain - setup possible RMRRs for the device */
3438 for_each_rmrr_units(rmrr) {
3439 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3444 ret = domain_prepare_identity_map(dev, domain,
3448 dev_err(dev, "Mapping reserved region failed\n");
3453 tmp = set_domain_for_dev(dev, domain);
3454 if (!tmp || domain != tmp) {
3455 domain_exit(domain);
3461 dev_err(dev, "Allocating domain failed\n");
3463 domain->domain.type = IOMMU_DOMAIN_DMA;
3468 /* Check if the dev needs to go through the non-identity map and unmap process. */
3469 static bool iommu_need_mapping(struct device *dev)
3473 if (iommu_dummy(dev))
3476 ret = identity_mapping(dev);
3478 u64 dma_mask = *dev->dma_mask;
3480 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3481 dma_mask = dev->coherent_dma_mask;
3483 if (dma_mask >= dma_direct_get_required_mask(dev))
3487 /* 32 bit DMA is removed from si_domain and falls back to
3488 * non-identity mapping. */
3490 dmar_remove_one_dev_info(dev);
3491 ret = iommu_request_dma_domain_for_dev(dev);
3493 struct iommu_domain *domain;
3494 struct dmar_domain *dmar_domain;
3496 domain = iommu_get_domain_for_dev(dev);
3498 dmar_domain = to_dmar_domain(domain);
3499 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3501 dmar_remove_one_dev_info(dev);
3502 get_private_domain_for_dev(dev);
3505 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3511 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3512 size_t size, int dir, u64 dma_mask)
3514 struct dmar_domain *domain;
3515 phys_addr_t start_paddr;
3516 unsigned long iova_pfn;
3519 struct intel_iommu *iommu;
3520 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3522 BUG_ON(dir == DMA_NONE);
3524 domain = find_domain(dev);
3526 return DMA_MAPPING_ERROR;
3528 iommu = domain_get_iommu(domain);
3529 size = aligned_nrpages(paddr, size);
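/* aligned_nrpages() converts the possibly unaligned byte range into the
 * number of VT-d pages needed to cover it. */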
3531 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3536 /* Check if DMAR supports zero-length reads on write-only mappings. */
3539 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3540 !cap_zlr(iommu->cap))
3541 prot |= DMA_PTE_READ;
3542 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3543 prot |= DMA_PTE_WRITE;
3545 /* paddr to (paddr + size) might span a partial page, so we should map the whole
3546 * page. Note: if two parts of one page are separately mapped, we
3547 * might have two guest addresses mapping to the same host paddr, but this
3548 * is not a big problem. */
3550 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3551 mm_to_dma_pfn(paddr_pfn), size, prot);
3555 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3556 start_paddr += paddr & ~PAGE_MASK;
3558 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3564 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3565 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3566 size, (unsigned long long)paddr, dir);
3567 return DMA_MAPPING_ERROR;
3570 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3571 unsigned long offset, size_t size,
3572 enum dma_data_direction dir,
3573 unsigned long attrs)
3575 if (iommu_need_mapping(dev))
3576 return __intel_map_single(dev, page_to_phys(page) + offset,
3577 size, dir, *dev->dma_mask);
3578 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3581 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3582 size_t size, enum dma_data_direction dir,
3583 unsigned long attrs)
3585 if (iommu_need_mapping(dev))
3586 return __intel_map_single(dev, phys_addr, size, dir,
3588 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3591 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3593 struct dmar_domain *domain;
3594 unsigned long start_pfn, last_pfn;
3595 unsigned long nrpages;
3596 unsigned long iova_pfn;
3597 struct intel_iommu *iommu;
3598 struct page *freelist;
3599 struct pci_dev *pdev = NULL;
3601 domain = find_domain(dev);
3604 iommu = domain_get_iommu(domain);
3606 iova_pfn = IOVA_PFN(dev_addr);
3608 nrpages = aligned_nrpages(dev_addr, size);
3609 start_pfn = mm_to_dma_pfn(iova_pfn);
3610 last_pfn = start_pfn + nrpages - 1;
3612 if (dev_is_pci(dev))
3613 pdev = to_pci_dev(dev);
3615 freelist = domain_unmap(domain, start_pfn, last_pfn);
3616 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3617 !has_iova_flush_queue(&domain->iovad)) {
3618 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3619 nrpages, !freelist, 0);
3621 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3622 dma_free_pagelist(freelist);
3624 queue_iova(&domain->iovad, iova_pfn, nrpages,
3625 (unsigned long)freelist);
3627 /* Queue up the release of the unmap to save the 1/6th of the
3628 * cpu used up by the iotlb flush operation... */
3632 trace_unmap_single(dev, dev_addr, size);
3635 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3636 size_t size, enum dma_data_direction dir,
3637 unsigned long attrs)
3639 if (iommu_need_mapping(dev))
3640 intel_unmap(dev, dev_addr, size);
3642 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3645 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3646 size_t size, enum dma_data_direction dir, unsigned long attrs)
3648 if (iommu_need_mapping(dev))
3649 intel_unmap(dev, dev_addr, size);
3652 static void *intel_alloc_coherent(struct device *dev, size_t size,
3653 dma_addr_t *dma_handle, gfp_t flags,
3654 unsigned long attrs)
3656 struct page *page = NULL;
3659 if (!iommu_need_mapping(dev))
3660 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3662 size = PAGE_ALIGN(size);
3663 order = get_order(size);
3665 if (gfpflags_allow_blocking(flags)) {
3666 unsigned int count = size >> PAGE_SHIFT;
3668 page = dma_alloc_from_contiguous(dev, count, order,
3669 flags & __GFP_NOWARN);
3673 page = alloc_pages(flags, order);
3676 memset(page_address(page), 0, size);
3678 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3680 dev->coherent_dma_mask);
3681 if (*dma_handle != DMA_MAPPING_ERROR)
3682 return page_address(page);
3683 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3684 __free_pages(page, order);
3689 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3690 dma_addr_t dma_handle, unsigned long attrs)
3693 struct page *page = virt_to_page(vaddr);
3695 if (!iommu_need_mapping(dev))
3696 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3698 size = PAGE_ALIGN(size);
3699 order = get_order(size);
3701 intel_unmap(dev, dma_handle, size);
3702 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3703 __free_pages(page, order);
3706 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3707 int nelems, enum dma_data_direction dir,
3708 unsigned long attrs)
3710 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3711 unsigned long nrpages = 0;
3712 struct scatterlist *sg;
3715 if (!iommu_need_mapping(dev))
3716 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3718 for_each_sg(sglist, sg, nelems, i) {
3719 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3722 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3724 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3727 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3728 enum dma_data_direction dir, unsigned long attrs)
3731 struct dmar_domain *domain;
3734 unsigned long iova_pfn;
3736 struct scatterlist *sg;
3737 unsigned long start_vpfn;
3738 struct intel_iommu *iommu;
3740 BUG_ON(dir == DMA_NONE);
3741 if (!iommu_need_mapping(dev))
3742 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3744 domain = find_domain(dev);
3748 iommu = domain_get_iommu(domain);
3750 for_each_sg(sglist, sg, nelems, i)
3751 size += aligned_nrpages(sg->offset, sg->length);
3753 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3756 sglist->dma_length = 0;
3761 /* Check if DMAR supports zero-length reads on write-only mappings. */
3764 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3765 !cap_zlr(iommu->cap))
3766 prot |= DMA_PTE_READ;
3767 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3768 prot |= DMA_PTE_WRITE;
3770 start_vpfn = mm_to_dma_pfn(iova_pfn);
3772 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3773 if (unlikely(ret)) {
3774 dma_pte_free_pagetable(domain, start_vpfn,
3775 start_vpfn + size - 1,
3776 agaw_to_level(domain->agaw) + 1);
3777 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3781 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3782 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3787 static u64 intel_get_required_mask(struct device *dev)
3789 if (!iommu_need_mapping(dev))
3790 return dma_direct_get_required_mask(dev);
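/* With IOMMU translation the device only ever needs 32-bit DMA addresses,
 * since IOVAs are allocated from the low 4GiB first. */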
3791 return DMA_BIT_MASK(32);
3794 static const struct dma_map_ops intel_dma_ops = {
3795 .alloc = intel_alloc_coherent,
3796 .free = intel_free_coherent,
3797 .map_sg = intel_map_sg,
3798 .unmap_sg = intel_unmap_sg,
3799 .map_page = intel_map_page,
3800 .unmap_page = intel_unmap_page,
3801 .map_resource = intel_map_resource,
3802 .unmap_resource = intel_unmap_resource,
3803 .dma_supported = dma_direct_supported,
3804 .mmap = dma_common_mmap,
3805 .get_sgtable = dma_common_get_sgtable,
3806 .get_required_mask = intel_get_required_mask,
3810 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3811 enum dma_data_direction dir, enum dma_sync_target target)
3813 struct dmar_domain *domain;
3814 phys_addr_t tlb_addr;
3816 domain = find_domain(dev);
3817 if (WARN_ON(!domain))
3820 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3821 if (is_swiotlb_buffer(tlb_addr))
3822 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3826 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3827 enum dma_data_direction dir, unsigned long attrs,
3830 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3831 struct dmar_domain *domain;
3832 struct intel_iommu *iommu;
3833 unsigned long iova_pfn;
3834 unsigned long nrpages;
3835 phys_addr_t tlb_addr;
3839 domain = find_domain(dev);
3840 if (WARN_ON(dir == DMA_NONE || !domain))
3841 return DMA_MAPPING_ERROR;
3843 iommu = domain_get_iommu(domain);
3844 if (WARN_ON(!iommu))
3845 return DMA_MAPPING_ERROR;
3847 nrpages = aligned_nrpages(0, size);
3848 iova_pfn = intel_alloc_iova(dev, domain,
3849 dma_to_mm_pfn(nrpages), dma_mask);
3851 return DMA_MAPPING_ERROR;
3854 /* Check if DMAR supports zero-length reads on write-only mappings. */
3857 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3858 !cap_zlr(iommu->cap))
3859 prot |= DMA_PTE_READ;
3860 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3861 prot |= DMA_PTE_WRITE;
3864 /* If both the physical buffer start address and size are
3865 * page aligned, we don't need to use a bounce page. */
3867 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3868 tlb_addr = swiotlb_tbl_map_single(dev,
3869 __phys_to_dma(dev, io_tlb_start),
3870 paddr, size, aligned_size, dir, attrs);
3871 if (tlb_addr == DMA_MAPPING_ERROR) {
3874 /* Cleanup the padding area. */
3875 void *padding_start = phys_to_virt(tlb_addr);
3876 size_t padding_size = aligned_size;
3878 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3879 (dir == DMA_TO_DEVICE ||
3880 dir == DMA_BIDIRECTIONAL)) {
3881 padding_start += size;
3882 padding_size -= size;
3885 memset(padding_start, 0, padding_size);
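/* Zeroing the unused part of the bounce slot keeps stale kernel data from
 * being exposed to the (potentially untrusted) device. */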
3891 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3892 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3896 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3898 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3901 if (is_swiotlb_buffer(tlb_addr))
3902 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3903 aligned_size, dir, attrs);
3905 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3906 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3907 size, (unsigned long long)paddr, dir);
3909 return DMA_MAPPING_ERROR;
3913 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3914 enum dma_data_direction dir, unsigned long attrs)
3916 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3917 struct dmar_domain *domain;
3918 phys_addr_t tlb_addr;
3920 domain = find_domain(dev);
3921 if (WARN_ON(!domain))
3924 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3925 if (WARN_ON(!tlb_addr))
3928 intel_unmap(dev, dev_addr, size);
3929 if (is_swiotlb_buffer(tlb_addr))
3930 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3931 aligned_size, dir, attrs);
3933 trace_bounce_unmap_single(dev, dev_addr, size);
3937 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3938 size_t size, enum dma_data_direction dir, unsigned long attrs)
3940 return bounce_map_single(dev, page_to_phys(page) + offset,
3941 size, dir, attrs, *dev->dma_mask);
3945 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3946 enum dma_data_direction dir, unsigned long attrs)
3948 return bounce_map_single(dev, phys_addr, size,
3949 dir, attrs, *dev->dma_mask);
3953 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3954 enum dma_data_direction dir, unsigned long attrs)
3956 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3960 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3961 enum dma_data_direction dir, unsigned long attrs)
3963 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3967 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3968 enum dma_data_direction dir, unsigned long attrs)
3970 struct scatterlist *sg;
3973 for_each_sg(sglist, sg, nelems, i)
3974 bounce_unmap_page(dev, sg->dma_address,
3975 sg_dma_len(sg), dir, attrs);
3979 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3980 enum dma_data_direction dir, unsigned long attrs)
3983 struct scatterlist *sg;
3985 for_each_sg(sglist, sg, nelems, i) {
3986 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3987 sg->offset, sg->length,
3989 if (sg->dma_address == DMA_MAPPING_ERROR)
3991 sg_dma_len(sg) = sg->length;
3997 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4002 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4003 size_t size, enum dma_data_direction dir)
4005 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4009 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4010 size_t size, enum dma_data_direction dir)
4012 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4016 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4017 int nelems, enum dma_data_direction dir)
4019 struct scatterlist *sg;
4022 for_each_sg(sglist, sg, nelems, i)
4023 bounce_sync_single(dev, sg_dma_address(sg),
4024 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4028 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4029 int nelems, enum dma_data_direction dir)
4031 struct scatterlist *sg;
4034 for_each_sg(sglist, sg, nelems, i)
4035 bounce_sync_single(dev, sg_dma_address(sg),
4036 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4039 static const struct dma_map_ops bounce_dma_ops = {
4040 .alloc = intel_alloc_coherent,
4041 .free = intel_free_coherent,
4042 .map_sg = bounce_map_sg,
4043 .unmap_sg = bounce_unmap_sg,
4044 .map_page = bounce_map_page,
4045 .unmap_page = bounce_unmap_page,
4046 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4047 .sync_single_for_device = bounce_sync_single_for_device,
4048 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4049 .sync_sg_for_device = bounce_sync_sg_for_device,
4050 .map_resource = bounce_map_resource,
4051 .unmap_resource = bounce_unmap_resource,
4052 .dma_supported = dma_direct_supported,
4055 static inline int iommu_domain_cache_init(void)
4059 iommu_domain_cache = kmem_cache_create("iommu_domain",
4060 sizeof(struct dmar_domain),
4065 if (!iommu_domain_cache) {
4066 pr_err("Couldn't create iommu_domain cache\n");
4073 static inline int iommu_devinfo_cache_init(void)
4077 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4078 sizeof(struct device_domain_info),
4082 if (!iommu_devinfo_cache) {
4083 pr_err("Couldn't create devinfo cache\n");
4090 static int __init iommu_init_mempool(void)
4093 ret = iova_cache_get();
4097 ret = iommu_domain_cache_init();
4101 ret = iommu_devinfo_cache_init();
4105 kmem_cache_destroy(iommu_domain_cache);
4112 static void __init iommu_exit_mempool(void)
4114 kmem_cache_destroy(iommu_devinfo_cache);
4115 kmem_cache_destroy(iommu_domain_cache);
4119 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4121 struct dmar_drhd_unit *drhd;
4125 /* We know that this device on this chipset has its own IOMMU.
4126 * If we find it under a different IOMMU, then the BIOS is lying
4127 * to us. Hope that the IOMMU for this device is actually
4128 * disabled, and it needs no translation... */
4130 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4132 /* "can't" happen */
4133 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4136 vtbar &= 0xffff0000;
4138 /* we know that this iommu should be at offset 0xa000 from vtbar */
4139 drhd = dmar_find_matched_drhd_unit(pdev);
4140 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4141 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4142 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4143 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4146 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4148 static void __init init_no_remapping_devices(void)
4150 struct dmar_drhd_unit *drhd;
4154 for_each_drhd_unit(drhd) {
4155 if (!drhd->include_all) {
4156 for_each_active_dev_scope(drhd->devices,
4157 drhd->devices_cnt, i, dev)
4159 /* ignore DMAR unit if no devices exist */
4160 if (i == drhd->devices_cnt)
4165 for_each_active_drhd_unit(drhd) {
4166 if (drhd->include_all)
4169 for_each_active_dev_scope(drhd->devices,
4170 drhd->devices_cnt, i, dev)
4171 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4173 if (i < drhd->devices_cnt)
4176 /* This IOMMU has *only* gfx devices. Either bypass it or
4177 set the gfx_mapped flag, as appropriate */
4178 if (!dmar_map_gfx) {
4180 for_each_active_dev_scope(drhd->devices,
4181 drhd->devices_cnt, i, dev)
4182 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4187 #ifdef CONFIG_SUSPEND
4188 static int init_iommu_hw(void)
4190 struct dmar_drhd_unit *drhd;
4191 struct intel_iommu *iommu = NULL;
4193 for_each_active_iommu(iommu, drhd)
4195 dmar_reenable_qi(iommu);
4197 for_each_iommu(iommu, drhd) {
4198 if (drhd->ignored) {
4200 * we always have to disable PMRs or DMA may fail on
4204 iommu_disable_protect_mem_regions(iommu);
4208 iommu_flush_write_buffer(iommu);
4210 iommu_set_root_entry(iommu);
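/* On resume, reprogram the root table and do a global context/IOTLB
 * invalidation before translation is re-enabled below. */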
4212 iommu->flush.flush_context(iommu, 0, 0, 0,
4213 DMA_CCMD_GLOBAL_INVL);
4214 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4215 iommu_enable_translation(iommu);
4216 iommu_disable_protect_mem_regions(iommu);
4222 static void iommu_flush_all(void)
4224 struct dmar_drhd_unit *drhd;
4225 struct intel_iommu *iommu;
4227 for_each_active_iommu(iommu, drhd) {
4228 iommu->flush.flush_context(iommu, 0, 0, 0,
4229 DMA_CCMD_GLOBAL_INVL);
4230 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4231 DMA_TLB_GLOBAL_FLUSH);
4235 static int iommu_suspend(void)
4237 struct dmar_drhd_unit *drhd;
4238 struct intel_iommu *iommu = NULL;
4241 for_each_active_iommu(iommu, drhd) {
4242 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4244 if (!iommu->iommu_state)
4250 for_each_active_iommu(iommu, drhd) {
4251 iommu_disable_translation(iommu);
4253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4255 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4256 readl(iommu->reg + DMAR_FECTL_REG);
4257 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4258 readl(iommu->reg + DMAR_FEDATA_REG);
4259 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4260 readl(iommu->reg + DMAR_FEADDR_REG);
4261 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4262 readl(iommu->reg + DMAR_FEUADDR_REG);
4264 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4269 for_each_active_iommu(iommu, drhd)
4270 kfree(iommu->iommu_state);
4275 static void iommu_resume(void)
4277 struct dmar_drhd_unit *drhd;
4278 struct intel_iommu *iommu = NULL;
4281 if (init_iommu_hw()) {
4283 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4285 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4289 for_each_active_iommu(iommu, drhd) {
4291 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4293 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4294 iommu->reg + DMAR_FECTL_REG);
4295 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4296 iommu->reg + DMAR_FEDATA_REG);
4297 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4298 iommu->reg + DMAR_FEADDR_REG);
4299 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4300 iommu->reg + DMAR_FEUADDR_REG);
4302 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4305 for_each_active_iommu(iommu, drhd)
4306 kfree(iommu->iommu_state);
4309 static struct syscore_ops iommu_syscore_ops = {
4310 .resume = iommu_resume,
4311 .suspend = iommu_suspend,
4314 static void __init init_iommu_pm_ops(void)
4316 register_syscore_ops(&iommu_syscore_ops);
4320 static inline void init_iommu_pm_ops(void) {}
4321 #endif /* CONFIG_PM */
4323 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4325 struct acpi_dmar_reserved_memory *rmrr;
4326 struct dmar_rmrr_unit *rmrru;
4328 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4332 rmrru->hdr = header;
4333 rmrr = (struct acpi_dmar_reserved_memory *)header;
4334 rmrru->base_address = rmrr->base_address;
4335 rmrru->end_address = rmrr->end_address;
4337 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4338 ((void *)rmrr) + rmrr->header.length,
4339 &rmrru->devices_cnt);
4340 if (rmrru->devices_cnt && rmrru->devices == NULL)
4343 list_add(&rmrru->list, &dmar_rmrr_units);
4352 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4354 struct dmar_atsr_unit *atsru;
4355 struct acpi_dmar_atsr *tmp;
4357 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4359 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4360 if (atsr->segment != tmp->segment)
4362 if (atsr->header.length != tmp->header.length)
4364 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4371 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4373 struct acpi_dmar_atsr *atsr;
4374 struct dmar_atsr_unit *atsru;
4376 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4379 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4380 atsru = dmar_find_atsr(atsr);
4384 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4389 /* If memory is allocated from slab by the ACPI _DSM method, we need to
4390 * copy the memory content because the memory buffer will be freed on exit. */
4393 atsru->hdr = (void *)(atsru + 1);
4394 memcpy(atsru->hdr, hdr, hdr->length);
4395 atsru->include_all = atsr->flags & 0x1;
4396 if (!atsru->include_all) {
4397 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4398 (void *)atsr + atsr->header.length,
4399 &atsru->devices_cnt);
4400 if (atsru->devices_cnt && atsru->devices == NULL) {
4406 list_add_rcu(&atsru->list, &dmar_atsr_units);
4411 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4413 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4417 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4419 struct acpi_dmar_atsr *atsr;
4420 struct dmar_atsr_unit *atsru;
4422 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4423 atsru = dmar_find_atsr(atsr);
4425 list_del_rcu(&atsru->list);
4427 intel_iommu_free_atsr(atsru);
4433 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4437 struct acpi_dmar_atsr *atsr;
4438 struct dmar_atsr_unit *atsru;
4440 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4441 atsru = dmar_find_atsr(atsr);
4445 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4446 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4454 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4457 struct intel_iommu *iommu = dmaru->iommu;
4459 if (g_iommus[iommu->seq_id])
4462 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4463 pr_warn("%s: Doesn't support hardware pass through.\n",
4467 if (!ecap_sc_support(iommu->ecap) &&
4468 domain_update_iommu_snooping(iommu)) {
4469 pr_warn("%s: Doesn't support snooping.\n",
4473 sp = domain_update_iommu_superpage(iommu) - 1;
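/* A hot-added unit must support the superpage sizes already in use
 * system-wide, otherwise existing large-page mappings could not be expressed
 * on it. */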
4474 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4475 pr_warn("%s: Doesn't support large page.\n",
4481 * Disable translation if already enabled prior to OS handover.
4483 if (iommu->gcmd & DMA_GCMD_TE)
4484 iommu_disable_translation(iommu);
4486 g_iommus[iommu->seq_id] = iommu;
4487 ret = iommu_init_domains(iommu);
4489 ret = iommu_alloc_root_entry(iommu);
4493 #ifdef CONFIG_INTEL_IOMMU_SVM
4494 if (pasid_supported(iommu))
4495 intel_svm_init(iommu);
4498 if (dmaru->ignored) {
4500 /* We always have to disable PMRs or DMA may fail on this device. */
4503 iommu_disable_protect_mem_regions(iommu);
4507 intel_iommu_init_qi(iommu);
4508 iommu_flush_write_buffer(iommu);
4510 #ifdef CONFIG_INTEL_IOMMU_SVM
4511 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4512 ret = intel_svm_enable_prq(iommu);
4517 ret = dmar_set_interrupt(iommu);
4521 iommu_set_root_entry(iommu);
4522 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4523 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4524 iommu_enable_translation(iommu);
4526 iommu_disable_protect_mem_regions(iommu);
4530 disable_dmar_iommu(iommu);
4532 free_dmar_iommu(iommu);
4536 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4539 struct intel_iommu *iommu = dmaru->iommu;
4541 if (!intel_iommu_enabled)
4547 ret = intel_iommu_add(dmaru);
4549 disable_dmar_iommu(iommu);
4550 free_dmar_iommu(iommu);
4556 static void intel_iommu_free_dmars(void)
4558 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4559 struct dmar_atsr_unit *atsru, *atsr_n;
4561 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4562 list_del(&rmrru->list);
4563 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4567 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4568 list_del(&atsru->list);
4569 intel_iommu_free_atsr(atsru);
4573 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4576 struct pci_bus *bus;
4577 struct pci_dev *bridge = NULL;
4579 struct acpi_dmar_atsr *atsr;
4580 struct dmar_atsr_unit *atsru;
4582 dev = pci_physfn(dev);
4583 for (bus = dev->bus; bus; bus = bus->parent) {
4585 /* If it's an integrated device, allow ATS */
4588 /* Connected via non-PCIe: no ATS */
4589 if (!pci_is_pcie(bridge) ||
4590 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4592 /* If we found the root port, look it up in the ATSR */
4593 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4598 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4599 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4600 if (atsr->segment != pci_domain_nr(dev->bus))
4603 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4604 if (tmp == &bridge->dev)
4607 if (atsru->include_all)
4617 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4620 struct dmar_rmrr_unit *rmrru;
4621 struct dmar_atsr_unit *atsru;
4622 struct acpi_dmar_atsr *atsr;
4623 struct acpi_dmar_reserved_memory *rmrr;
4625 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4628 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4629 rmrr = container_of(rmrru->hdr,
4630 struct acpi_dmar_reserved_memory, header);
4631 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4632 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4633 ((void *)rmrr) + rmrr->header.length,
4634 rmrr->segment, rmrru->devices,
4635 rmrru->devices_cnt);
4638 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4639 dmar_remove_dev_scope(info, rmrr->segment,
4640 rmrru->devices, rmrru->devices_cnt);
4644 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4645 if (atsru->include_all)
4648 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4649 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4650 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4651 (void *)atsr + atsr->header.length,
4652 atsr->segment, atsru->devices,
4653 atsru->devices_cnt);
4658 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4659 if (dmar_remove_dev_scope(info, atsr->segment,
4660 atsru->devices, atsru->devices_cnt))
4668 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4669 unsigned long val, void *v)
4671 struct memory_notify *mhp = v;
4672 unsigned long long start, end;
4673 unsigned long start_vpfn, last_vpfn;
4676 case MEM_GOING_ONLINE:
4677 start = mhp->start_pfn << PAGE_SHIFT;
4678 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
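/* Memory that is about to come online must be added to the si_domain identity
 * map so identity-mapped devices can DMA to it. */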
4679 if (iommu_domain_identity_map(si_domain, start, end)) {
4680 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4687 case MEM_CANCEL_ONLINE:
4688 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4689 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4690 while (start_vpfn <= last_vpfn) {
4692 struct dmar_drhd_unit *drhd;
4693 struct intel_iommu *iommu;
4694 struct page *freelist;
4696 iova = find_iova(&si_domain->iovad, start_vpfn);
4698 pr_debug("Failed get IOVA for PFN %lx\n",
4703 iova = split_and_remove_iova(&si_domain->iovad, iova,
4704 start_vpfn, last_vpfn);
4706 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4707 start_vpfn, last_vpfn);
4711 freelist = domain_unmap(si_domain, iova->pfn_lo,
4715 for_each_active_iommu(iommu, drhd)
4716 iommu_flush_iotlb_psi(iommu, si_domain,
4717 iova->pfn_lo, iova_size(iova),
4720 dma_free_pagelist(freelist);
4722 start_vpfn = iova->pfn_hi + 1;
4723 free_iova_mem(iova);
4731 static struct notifier_block intel_iommu_memory_nb = {
4732 .notifier_call = intel_iommu_memory_notifier,
4736 static void free_all_cpu_cached_iovas(unsigned int cpu)
4740 for (i = 0; i < g_num_of_iommus; i++) {
4741 struct intel_iommu *iommu = g_iommus[i];
4742 struct dmar_domain *domain;
4748 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4749 domain = get_iommu_domain(iommu, (u16)did);
4753 free_cpu_cached_iovas(cpu, &domain->iovad);
4758 static int intel_iommu_cpu_dead(unsigned int cpu)
4760 free_all_cpu_cached_iovas(cpu);
4764 static void intel_disable_iommus(void)
4766 struct intel_iommu *iommu = NULL;
4767 struct dmar_drhd_unit *drhd;
4769 for_each_iommu(iommu, drhd)
4770 iommu_disable_translation(iommu);
4773 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4775 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4777 return container_of(iommu_dev, struct intel_iommu, iommu);
4780 static ssize_t intel_iommu_show_version(struct device *dev,
4781 struct device_attribute *attr,
4784 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4785 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4786 return sprintf(buf, "%d:%d\n",
4787 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4789 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4791 static ssize_t intel_iommu_show_address(struct device *dev,
4792 struct device_attribute *attr,
4795 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4796 return sprintf(buf, "%llx\n", iommu->reg_phys);
4798 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4800 static ssize_t intel_iommu_show_cap(struct device *dev,
4801 struct device_attribute *attr,
4804 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4805 return sprintf(buf, "%llx\n", iommu->cap);
4807 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4809 static ssize_t intel_iommu_show_ecap(struct device *dev,
4810 struct device_attribute *attr,
4813 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4814 return sprintf(buf, "%llx\n", iommu->ecap);
4816 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4818 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4819 struct device_attribute *attr,
4822 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4823 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4825 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4827 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4828 struct device_attribute *attr,
4831 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4832 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4833 cap_ndoms(iommu->cap)));
4835 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
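/*
 * These attributes are exposed through the iommu class device registered
 * from intel_iommu_init() below. On a typical system they show up as, for
 * example, /sys/class/iommu/dmar0/intel-iommu/version (printed as
 * "major:minor"), .../cap, .../ecap, .../domains_supported and
 * .../domains_used; the exact device name depends on DRHD enumeration order.
 */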
4837 static struct attribute *intel_iommu_attrs[] = {
4838 &dev_attr_version.attr,
4839 &dev_attr_address.attr,
4841 &dev_attr_ecap.attr,
4842 &dev_attr_domains_supported.attr,
4843 &dev_attr_domains_used.attr,
4847 static struct attribute_group intel_iommu_group = {
4848 .name = "intel-iommu",
4849 .attrs = intel_iommu_attrs,
4852 const struct attribute_group *intel_iommu_groups[] = {
4857 static inline bool has_untrusted_dev(void)
4859 struct pci_dev *pdev = NULL;
4861 for_each_pci_dev(pdev)
4862 if (pdev->untrusted)
4868 static int __init platform_optin_force_iommu(void)
4870 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4873 if (no_iommu || dmar_disabled)
4874 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4877 * If Intel-IOMMU is disabled by default, we will apply the identity
4878 * map for all devices except those marked as untrusted.
4881 iommu_identity_mapping |= IDENTMAP_ALL;
4889 static int __init probe_acpi_namespace_devices(void)
4891 struct dmar_drhd_unit *drhd;
4892 /* To avoid a -Wunused-but-set-variable warning. */
4893 struct intel_iommu *iommu __maybe_unused;
4897 for_each_active_iommu(iommu, drhd) {
4898 for_each_active_dev_scope(drhd->devices,
4899 drhd->devices_cnt, i, dev) {
4900 struct acpi_device_physical_node *pn;
4901 struct iommu_group *group;
4902 struct acpi_device *adev;
4904 if (dev->bus != &acpi_bus_type)
4907 adev = to_acpi_device(dev);
4908 mutex_lock(&adev->physical_node_lock);
4909 list_for_each_entry(pn,
4910 &adev->physical_node_list, node) {
4911 group = iommu_group_get(pn->dev);
4913 iommu_group_put(group);
4917 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4918 ret = iommu_probe_device(pn->dev);
4922 mutex_unlock(&adev->physical_node_lock);
4932 int __init intel_iommu_init(void)
4935 struct dmar_drhd_unit *drhd;
4936 struct intel_iommu *iommu;
4939 * Intel IOMMU is required for a TXT/tboot launch or platform
4940 * opt-in, so enforce that.
4942 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4943 platform_optin_force_iommu();
4945 if (iommu_init_mempool()) {
4947 panic("tboot: Failed to initialize iommu memory\n");
4951 down_write(&dmar_global_lock);
4952 if (dmar_table_init()) {
4954 panic("tboot: Failed to initialize DMAR table\n");
4958 if (dmar_dev_scope_init() < 0) {
4960 panic("tboot: Failed to initialize DMAR device scope\n");
4964 up_write(&dmar_global_lock);
4967 * The bus notifier takes the dmar_global_lock, so lockdep will
4968 * complain later when we register it under the lock.
4970 dmar_register_bus_notifier();
4972 down_write(&dmar_global_lock);
4975 intel_iommu_debugfs_init();
4977 if (no_iommu || dmar_disabled) {
4979 * We exit the function here to ensure the IOMMU's remapping and
4980 * mempool aren't set up, which means that the IOMMU's PMRs
4981 * won't be disabled via the call to init_dmars(). So disable
4982 * them explicitly here. The PMRs were set up by tboot prior to
4983 * calling SENTER, but the kernel is expected to reset/tear down the PMRs.
4986 if (intel_iommu_tboot_noforce) {
4987 for_each_iommu(iommu, drhd)
4988 iommu_disable_protect_mem_regions(iommu);
4992 * Make sure the IOMMUs are switched off, even when we
4993 * boot into a kexec kernel and the previous kernel left them enabled.
4996 intel_disable_iommus();
5000 if (list_empty(&dmar_rmrr_units))
5001 pr_info("No RMRR found\n");
5003 if (list_empty(&dmar_atsr_units))
5004 pr_info("No ATSR found\n");
5006 if (dmar_init_reserved_ranges()) {
5008 panic("tboot: Failed to reserve iommu ranges\n");
5009 goto out_free_reserved_range;
5013 intel_iommu_gfx_mapped = 1;
5015 init_no_remapping_devices();
5020 panic("tboot: Failed to initialize DMARs\n");
5021 pr_err("Initialization failed\n");
5022 goto out_free_reserved_range;
5024 up_write(&dmar_global_lock);
5026 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5028 * If the system has no untrusted device or the user has decided
5029 * to disable the bounce page mechanisms, we don't need swiotlb.
5030 * Mark this, and the pre-allocated bounce pages will be released later.
5033 if (!has_untrusted_dev() || intel_no_bounce)
5036 dma_ops = &intel_dma_ops;
5038 init_iommu_pm_ops();
5040 down_read(&dmar_global_lock);
5041 for_each_active_iommu(iommu, drhd) {
5042 iommu_device_sysfs_add(&iommu->iommu, NULL,
5045 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5046 iommu_device_register(&iommu->iommu);
5048 up_read(&dmar_global_lock);
5050 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5051 if (si_domain && !hw_pass_through)
5052 register_memory_notifier(&intel_iommu_memory_nb);
5053 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5054 intel_iommu_cpu_dead);
5056 down_read(&dmar_global_lock);
5057 if (probe_acpi_namespace_devices())
5058 pr_warn("ACPI namespace devices didn't probe correctly\n");
5060 /* Finally, we enable the DMA remapping hardware. */
5061 for_each_iommu(iommu, drhd) {
5062 if (!drhd->ignored && !translation_pre_enabled(iommu))
5063 iommu_enable_translation(iommu);
5065 iommu_disable_protect_mem_regions(iommu);
5067 up_read(&dmar_global_lock);
5069 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5071 intel_iommu_enabled = 1;
5075 out_free_reserved_range:
5076 put_iova_domain(&reserved_iova_list);
5078 intel_iommu_free_dmars();
5079 up_write(&dmar_global_lock);
5080 iommu_exit_mempool();
5084 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5086 struct intel_iommu *iommu = opaque;
5088 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5093 * NB - intel-iommu lacks any sort of reference counting for the users of
5094 * dependent devices. If multiple endpoints have intersecting dependent
5095 * devices, unbinding the driver from any one of them may leave
5096 * the others unable to operate.
5098 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5100 if (!iommu || !dev || !dev_is_pci(dev))
5103 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
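/*
 * pci_for_each_dma_alias() invokes the callback for every requester ID the
 * device's DMA may appear under, not just its own bus/devfn. For example, a
 * conventional PCI device behind a PCIe-to-PCI bridge is aliased by the
 * bridge, so the context entries programmed for those aliases must be
 * cleared as well.
 */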
5106 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5108 struct dmar_domain *domain;
5109 struct intel_iommu *iommu;
5110 unsigned long flags;
5112 assert_spin_locked(&device_domain_lock);
5117 iommu = info->iommu;
5118 domain = info->domain;
5121 if (dev_is_pci(info->dev) && sm_supported(iommu))
5122 intel_pasid_tear_down_entry(iommu, info->dev,
5125 iommu_disable_dev_iotlb(info);
5126 domain_context_clear(iommu, info->dev);
5127 intel_pasid_free_table(info->dev);
5130 unlink_domain_info(info);
5132 spin_lock_irqsave(&iommu->lock, flags);
5133 domain_detach_iommu(domain, iommu);
5134 spin_unlock_irqrestore(&iommu->lock, flags);
5136 /* free the private domain */
5137 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5138 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5139 list_empty(&domain->devices))
5140 domain_exit(info->domain);
5142 free_devinfo_mem(info);
5145 static void dmar_remove_one_dev_info(struct device *dev)
5147 struct device_domain_info *info;
5148 unsigned long flags;
5150 spin_lock_irqsave(&device_domain_lock, flags);
5151 info = dev->archdata.iommu;
5152 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5153 && info != DUMMY_DEVICE_DOMAIN_INFO)
5154 __dmar_remove_one_dev_info(info);
5155 spin_unlock_irqrestore(&device_domain_lock, flags);
5158 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5162 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5163 domain_reserve_special_ranges(domain);
5165 /* calculate AGAW */
5166 domain->gaw = guest_width;
5167 adjust_width = guestwidth_to_adjustwidth(guest_width);
5168 domain->agaw = width_to_agaw(adjust_width);
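/*
 * For example, a guest_width of 48 bits is already a supported adjusted
 * guest address width, so adjust_width stays 48 and width_to_agaw() yields
 * an AGAW of 2, i.e. a 4-level page table.
 */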
5170 domain->iommu_coherency = 0;
5171 domain->iommu_snooping = 0;
5172 domain->iommu_superpage = 0;
5173 domain->max_addr = 0;
5175 /* always allocate the top pgd */
5176 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5179 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5183 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5185 struct dmar_domain *dmar_domain;
5186 struct iommu_domain *domain;
5189 case IOMMU_DOMAIN_DMA:
5191 case IOMMU_DOMAIN_UNMANAGED:
5192 dmar_domain = alloc_domain(0);
5194 pr_err("Can't allocate dmar_domain\n");
5197 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5198 pr_err("Domain initialization failed\n");
5199 domain_exit(dmar_domain);
5203 if (type == IOMMU_DOMAIN_DMA &&
5204 init_iova_flush_queue(&dmar_domain->iovad,
5205 iommu_flush_iova, iova_entry_free)) {
5206 pr_warn("iova flush queue initialization failed\n");
5207 intel_iommu_strict = 1;
5210 domain_update_iommu_cap(dmar_domain);
5212 domain = &dmar_domain->domain;
5213 domain->geometry.aperture_start = 0;
5214 domain->geometry.aperture_end =
5215 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5216 domain->geometry.force_aperture = true;
5219 case IOMMU_DOMAIN_IDENTITY:
5220 return &si_domain->domain;
5228 static void intel_iommu_domain_free(struct iommu_domain *domain)
5230 if (domain != &si_domain->domain)
5231 domain_exit(to_dmar_domain(domain));
5235 * Check whether a @domain could be attached to the @dev through the
5236 * aux-domain attach/detach APIs.
5239 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5241 struct device_domain_info *info = dev->archdata.iommu;
5243 return info && info->auxd_enabled &&
5244 domain->type == IOMMU_DOMAIN_UNMANAGED;
5247 static void auxiliary_link_device(struct dmar_domain *domain,
5250 struct device_domain_info *info = dev->archdata.iommu;
5252 assert_spin_locked(&device_domain_lock);
5256 domain->auxd_refcnt++;
5257 list_add(&domain->auxd, &info->auxiliary_domains);
5260 static void auxiliary_unlink_device(struct dmar_domain *domain,
5263 struct device_domain_info *info = dev->archdata.iommu;
5265 assert_spin_locked(&device_domain_lock);
5269 list_del(&domain->auxd);
5270 domain->auxd_refcnt--;
5272 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5273 intel_pasid_free_id(domain->default_pasid);
5276 static int aux_domain_add_dev(struct dmar_domain *domain,
5281 unsigned long flags;
5282 struct intel_iommu *iommu;
5284 iommu = device_to_iommu(dev, &bus, &devfn);
5288 if (domain->default_pasid <= 0) {
5291 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5292 pci_max_pasids(to_pci_dev(dev)),
5295 pr_err("Can't allocate default pasid\n");
5298 domain->default_pasid = pasid;
5301 spin_lock_irqsave(&device_domain_lock, flags);
5303 * iommu->lock must be held to attach the domain to the iommu and to set
5304 * up the PASID entry for second-level translation.
5306 spin_lock(&iommu->lock);
5307 ret = domain_attach_iommu(domain, iommu);
5311 /* Set up the PASID entry for mediated devices: */
5312 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5313 domain->default_pasid);
5316 spin_unlock(&iommu->lock);
5318 auxiliary_link_device(domain, dev);
5320 spin_unlock_irqrestore(&device_domain_lock, flags);
5325 domain_detach_iommu(domain, iommu);
5327 spin_unlock(&iommu->lock);
5328 spin_unlock_irqrestore(&device_domain_lock, flags);
5329 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5330 intel_pasid_free_id(domain->default_pasid);
5335 static void aux_domain_remove_dev(struct dmar_domain *domain,
5338 struct device_domain_info *info;
5339 struct intel_iommu *iommu;
5340 unsigned long flags;
5342 if (!is_aux_domain(dev, &domain->domain))
5345 spin_lock_irqsave(&device_domain_lock, flags);
5346 info = dev->archdata.iommu;
5347 iommu = info->iommu;
5349 auxiliary_unlink_device(domain, dev);
5351 spin_lock(&iommu->lock);
5352 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5353 domain_detach_iommu(domain, iommu);
5354 spin_unlock(&iommu->lock);
5356 spin_unlock_irqrestore(&device_domain_lock, flags);
5359 static int prepare_domain_attach_device(struct iommu_domain *domain,
5362 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5363 struct intel_iommu *iommu;
5367 iommu = device_to_iommu(dev, &bus, &devfn);
5371 /* check if this iommu agaw is sufficient for max mapped address */
5372 addr_width = agaw_to_width(iommu->agaw);
5373 if (addr_width > cap_mgaw(iommu->cap))
5374 addr_width = cap_mgaw(iommu->cap);
5376 if (dmar_domain->max_addr > (1LL << addr_width)) {
5377 dev_err(dev, "%s: iommu width (%d) is not "
5378 "sufficient for the mapped address (%llx)\n",
5379 __func__, addr_width, dmar_domain->max_addr);
5382 dmar_domain->gaw = addr_width;
5385 * Knock out extra levels of page tables if necessary
5387 while (iommu->agaw < dmar_domain->agaw) {
5388 struct dma_pte *pte;
5390 pte = dmar_domain->pgd;
5391 if (dma_pte_present(pte)) {
5392 dmar_domain->pgd = (struct dma_pte *)
5393 phys_to_virt(dma_pte_addr(pte));
5394 free_pgtable_page(pte);
5396 dmar_domain->agaw--;
5402 static int intel_iommu_attach_device(struct iommu_domain *domain,
5407 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5408 device_is_rmrr_locked(dev)) {
5409 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5413 if (is_aux_domain(dev, domain))
5416 /* normally dev is not mapped */
5417 if (unlikely(domain_context_mapped(dev))) {
5418 struct dmar_domain *old_domain;
5420 old_domain = find_domain(dev);
5422 dmar_remove_one_dev_info(dev);
5425 ret = prepare_domain_attach_device(domain, dev);
5429 return domain_add_dev_info(to_dmar_domain(domain), dev);
5432 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5437 if (!is_aux_domain(dev, domain))
5440 ret = prepare_domain_attach_device(domain, dev);
5444 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5447 static void intel_iommu_detach_device(struct iommu_domain *domain,
5450 dmar_remove_one_dev_info(dev);
5453 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5456 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5459 static int intel_iommu_map(struct iommu_domain *domain,
5460 unsigned long iova, phys_addr_t hpa,
5461 size_t size, int iommu_prot, gfp_t gfp)
5463 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5468 if (iommu_prot & IOMMU_READ)
5469 prot |= DMA_PTE_READ;
5470 if (iommu_prot & IOMMU_WRITE)
5471 prot |= DMA_PTE_WRITE;
5472 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5473 prot |= DMA_PTE_SNP;
5475 max_addr = iova + size;
5476 if (dmar_domain->max_addr < max_addr) {
5479 /* check if minimum agaw is sufficient for mapped address */
5480 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5481 if (end < max_addr) {
5482 pr_err("%s: iommu width (%d) is not "
5483 "sufficient for the mapped address (%llx)\n",
5484 __func__, dmar_domain->gaw, max_addr);
5487 dmar_domain->max_addr = max_addr;
5489 /* Round up size to next multiple of PAGE_SIZE, if it and
5490 the low bits of hpa would take us onto the next page */
5491 size = aligned_nrpages(hpa, size);
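/*
 * Illustrative numbers: an hpa of 0x1ff0 with a size of 0x20 crosses a 4KiB
 * page boundary, so aligned_nrpages() returns 2 here even though the length
 * itself is only 32 bytes; from this point on "size" is a page count rather
 * than a byte count.
 */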
5492 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5493 hpa >> VTD_PAGE_SHIFT, size, prot);
5497 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5498 unsigned long iova, size_t size,
5499 struct iommu_iotlb_gather *gather)
5501 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5502 struct page *freelist = NULL;
5503 unsigned long start_pfn, last_pfn;
5504 unsigned int npages;
5505 int iommu_id, level = 0;
5507 /* Cope with horrid API which requires us to unmap more than the
5508 size argument if it happens to be a large-page mapping. */
5509 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5511 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5512 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
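/*
 * For example, if the IOVA falls inside a 2MiB superpage the PTE is found
 * at level 2; level_to_offset_bits(2) is 9, so a 4KiB unmap request is
 * widened here to VTD_PAGE_SIZE << 9 = 2MiB and the whole large page is
 * torn down.
 */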
5514 start_pfn = iova >> VTD_PAGE_SHIFT;
5515 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5517 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5519 npages = last_pfn - start_pfn + 1;
5521 for_each_domain_iommu(iommu_id, dmar_domain)
5522 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5523 start_pfn, npages, !freelist, 0);
5525 dma_free_pagelist(freelist);
5527 if (dmar_domain->max_addr == iova + size)
5528 dmar_domain->max_addr = iova;
5533 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5536 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5537 struct dma_pte *pte;
5541 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5542 if (pte && dma_pte_present(pte))
5543 phys = dma_pte_addr(pte) +
5544 (iova & (BIT_MASK(level_to_offset_bits(level) +
5545 VTD_PAGE_SHIFT) - 1));
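/*
 * For example, for a 2MiB superpage mapping (level 2) the mask computed
 * above is BIT_MASK(9 + VTD_PAGE_SHIFT) - 1 = 0x1fffff, so the low 21 bits
 * of the IOVA are added back onto the 2MiB-aligned address taken from the
 * PTE.
 */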
5550 static inline bool scalable_mode_support(void)
5552 struct dmar_drhd_unit *drhd;
5553 struct intel_iommu *iommu;
5557 for_each_active_iommu(iommu, drhd) {
5558 if (!sm_supported(iommu)) {
5568 static inline bool iommu_pasid_support(void)
5570 struct dmar_drhd_unit *drhd;
5571 struct intel_iommu *iommu;
5575 for_each_active_iommu(iommu, drhd) {
5576 if (!pasid_supported(iommu)) {
5586 static bool intel_iommu_capable(enum iommu_cap cap)
5588 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5589 return domain_update_iommu_snooping(NULL) == 1;
5590 if (cap == IOMMU_CAP_INTR_REMAP)
5591 return irq_remapping_enabled == 1;
5596 static int intel_iommu_add_device(struct device *dev)
5598 struct dmar_domain *dmar_domain;
5599 struct iommu_domain *domain;
5600 struct intel_iommu *iommu;
5601 struct iommu_group *group;
5605 iommu = device_to_iommu(dev, &bus, &devfn);
5609 iommu_device_link(&iommu->iommu, dev);
5611 if (translation_pre_enabled(iommu))
5612 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5614 group = iommu_group_get_for_dev(dev);
5616 if (IS_ERR(group)) {
5617 ret = PTR_ERR(group);
5621 iommu_group_put(group);
5623 domain = iommu_get_domain_for_dev(dev);
5624 dmar_domain = to_dmar_domain(domain);
5625 if (domain->type == IOMMU_DOMAIN_DMA) {
5626 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5627 ret = iommu_request_dm_for_dev(dev);
5629 dmar_remove_one_dev_info(dev);
5630 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5631 domain_add_dev_info(si_domain, dev);
5633 "Device uses a private identity domain.\n");
5637 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5638 ret = iommu_request_dma_domain_for_dev(dev);
5640 dmar_remove_one_dev_info(dev);
5641 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5642 if (!get_private_domain_for_dev(dev)) {
5644 "Failed to get a private domain.\n");
5650 "Device uses a private dma domain.\n");
5655 if (device_needs_bounce(dev)) {
5656 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5657 set_dma_ops(dev, &bounce_dma_ops);
5663 iommu_device_unlink(&iommu->iommu, dev);
5667 static void intel_iommu_remove_device(struct device *dev)
5669 struct intel_iommu *iommu;
5672 iommu = device_to_iommu(dev, &bus, &devfn);
5676 dmar_remove_one_dev_info(dev);
5678 iommu_group_remove_device(dev);
5680 iommu_device_unlink(&iommu->iommu, dev);
5682 if (device_needs_bounce(dev))
5683 set_dma_ops(dev, NULL);
5686 static void intel_iommu_get_resv_regions(struct device *device,
5687 struct list_head *head)
5689 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5690 struct iommu_resv_region *reg;
5691 struct dmar_rmrr_unit *rmrr;
5692 struct device *i_dev;
5695 down_read(&dmar_global_lock);
5696 for_each_rmrr_units(rmrr) {
5697 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5699 struct iommu_resv_region *resv;
5700 enum iommu_resv_type type;
5703 if (i_dev != device &&
5704 !is_downstream_to_pci_bridge(device, i_dev))
5707 length = rmrr->end_address - rmrr->base_address + 1;
5709 type = device_rmrr_is_relaxable(device) ?
5710 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5712 resv = iommu_alloc_resv_region(rmrr->base_address,
5713 length, prot, type);
5717 list_add_tail(&resv->list, head);
5720 up_read(&dmar_global_lock);
5722 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5723 if (dev_is_pci(device)) {
5724 struct pci_dev *pdev = to_pci_dev(device);
5726 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5727 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5728 IOMMU_RESV_DIRECT_RELAXABLE);
5730 list_add_tail(&reg->list, head);
5733 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5735 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5736 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5740 list_add_tail(&reg->list, head);
5743 static void intel_iommu_put_resv_regions(struct device *dev,
5744 struct list_head *head)
5746 struct iommu_resv_region *entry, *next;
5748 list_for_each_entry_safe(entry, next, head, list)
5752 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5754 struct device_domain_info *info;
5755 struct context_entry *context;
5756 struct dmar_domain *domain;
5757 unsigned long flags;
5761 domain = find_domain(dev);
5765 spin_lock_irqsave(&device_domain_lock, flags);
5766 spin_lock(&iommu->lock);
5769 info = dev->archdata.iommu;
5770 if (!info || !info->pasid_supported)
5773 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5774 if (WARN_ON(!context))
5777 ctx_lo = context[0].lo;
5779 if (!(ctx_lo & CONTEXT_PASIDE)) {
5780 ctx_lo |= CONTEXT_PASIDE;
5781 context[0].lo = ctx_lo;
5783 iommu->flush.flush_context(iommu,
5784 domain->iommu_did[iommu->seq_id],
5785 PCI_DEVID(info->bus, info->devfn),
5786 DMA_CCMD_MASK_NOBIT,
5787 DMA_CCMD_DEVICE_INVL);
5790 /* Enable PASID support in the device, if it wasn't already */
5791 if (!info->pasid_enabled)
5792 iommu_enable_dev_iotlb(info);
5797 spin_unlock(&iommu->lock);
5798 spin_unlock_irqrestore(&device_domain_lock, flags);
5803 static void intel_iommu_apply_resv_region(struct device *dev,
5804 struct iommu_domain *domain,
5805 struct iommu_resv_region *region)
5807 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5808 unsigned long start, end;
5810 start = IOVA_PFN(region->start);
5811 end = IOVA_PFN(region->start + region->length - 1);
5813 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5816 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5818 if (dev_is_pci(dev))
5819 return pci_device_group(dev);
5820 return generic_device_group(dev);
5823 #ifdef CONFIG_INTEL_IOMMU_SVM
5824 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5826 struct intel_iommu *iommu;
5829 if (iommu_dummy(dev)) {
5831 "No IOMMU translation for device; cannot enable SVM\n");
5835 iommu = device_to_iommu(dev, &bus, &devfn);
5837 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5843 #endif /* CONFIG_INTEL_IOMMU_SVM */
5845 static int intel_iommu_enable_auxd(struct device *dev)
5847 struct device_domain_info *info;
5848 struct intel_iommu *iommu;
5849 unsigned long flags;
5853 iommu = device_to_iommu(dev, &bus, &devfn);
5854 if (!iommu || dmar_disabled)
5857 if (!sm_supported(iommu) || !pasid_supported(iommu))
5860 ret = intel_iommu_enable_pasid(iommu, dev);
5864 spin_lock_irqsave(&device_domain_lock, flags);
5865 info = dev->archdata.iommu;
5866 info->auxd_enabled = 1;
5867 spin_unlock_irqrestore(&device_domain_lock, flags);
5872 static int intel_iommu_disable_auxd(struct device *dev)
5874 struct device_domain_info *info;
5875 unsigned long flags;
5877 spin_lock_irqsave(&device_domain_lock, flags);
5878 info = dev->archdata.iommu;
5879 if (!WARN_ON(!info))
5880 info->auxd_enabled = 0;
5881 spin_unlock_irqrestore(&device_domain_lock, flags);
5887 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5888 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5889 * spec so that system software and tools can detect endpoint devices that
5890 * support Intel Scalable I/O Virtualization without any host driver dependency.
5892 * Returns the offset of the matching extended capability structure within
5893 * the device's PCI configuration space, or 0 if the device does not support it.
5896 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5901 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5903 pci_read_config_word(pdev, pos + 4, &vendor);
5904 pci_read_config_word(pdev, pos + 8, &id);
5905 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5908 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5915 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5917 if (feat == IOMMU_DEV_FEAT_AUX) {
5920 if (!dev_is_pci(dev) || dmar_disabled ||
5921 !scalable_mode_support() || !iommu_pasid_support())
5924 ret = pci_pasid_features(to_pci_dev(dev));
5928 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5935 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5937 if (feat == IOMMU_DEV_FEAT_AUX)
5938 return intel_iommu_enable_auxd(dev);
5944 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5946 if (feat == IOMMU_DEV_FEAT_AUX)
5947 return intel_iommu_disable_auxd(dev);
5953 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5955 struct device_domain_info *info = dev->archdata.iommu;
5957 if (feat == IOMMU_DEV_FEAT_AUX)
5958 return scalable_mode_support() && info && info->auxd_enabled;
5964 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5966 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5968 return dmar_domain->default_pasid > 0 ?
5969 dmar_domain->default_pasid : -EINVAL;
5972 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5975 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5979 * Check that the device does not live on an external-facing PCI port that is
5980 * marked as untrusted. Such devices should not be allowed to apply quirks and
5981 * thereby bypass the IOMMU restrictions.
5983 static bool risky_device(struct pci_dev *pdev)
5985 if (pdev->untrusted) {
5987 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5988 pdev->vendor, pdev->device);
5989 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5995 const struct iommu_ops intel_iommu_ops = {
5996 .capable = intel_iommu_capable,
5997 .domain_alloc = intel_iommu_domain_alloc,
5998 .domain_free = intel_iommu_domain_free,
5999 .attach_dev = intel_iommu_attach_device,
6000 .detach_dev = intel_iommu_detach_device,
6001 .aux_attach_dev = intel_iommu_aux_attach_device,
6002 .aux_detach_dev = intel_iommu_aux_detach_device,
6003 .aux_get_pasid = intel_iommu_aux_get_pasid,
6004 .map = intel_iommu_map,
6005 .unmap = intel_iommu_unmap,
6006 .iova_to_phys = intel_iommu_iova_to_phys,
6007 .add_device = intel_iommu_add_device,
6008 .remove_device = intel_iommu_remove_device,
6009 .get_resv_regions = intel_iommu_get_resv_regions,
6010 .put_resv_regions = intel_iommu_put_resv_regions,
6011 .apply_resv_region = intel_iommu_apply_resv_region,
6012 .device_group = intel_iommu_device_group,
6013 .dev_has_feat = intel_iommu_dev_has_feat,
6014 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6015 .dev_enable_feat = intel_iommu_dev_enable_feat,
6016 .dev_disable_feat = intel_iommu_dev_disable_feat,
6017 .is_attach_deferred = intel_iommu_is_attach_deferred,
6018 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6021 static void quirk_iommu_igfx(struct pci_dev *dev)
6023 if (risky_device(dev))
6026 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6030 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6039 /* Broadwell igfx malfunctions with dmar */
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6051 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6053 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6056 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6057 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6058 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6059 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6060 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6061 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6062 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6063 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6065 static void quirk_iommu_rwbf(struct pci_dev *dev)
6067 if (risky_device(dev))
6071 * Mobile 4 Series Chipset neglects to set RWBF capability,
6072 * but needs it. Same seems to hold for the desktop versions.
6074 pci_info(dev, "Forcing write-buffer flush capability\n");
6078 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6079 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6080 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6081 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6082 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6083 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6084 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6087 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6088 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6089 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6090 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6091 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6092 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6093 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6094 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
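/*
 * Illustrative GGC values: a graphics-mode-select field of 0x1
 * (GGC_MEMORY_SIZE_1M) leaves GGC_MEMORY_VT_ENABLED clear, meaning the BIOS
 * set aside no memory for a shadow GTT, so the quirk below disables the
 * IOMMU for graphics; a field of 0x9 (GGC_MEMORY_SIZE_2M_VT) has the VT bit
 * set and the graphics mapping is kept.
 */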
6096 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6100 if (risky_device(dev))
6103 if (pci_read_config_word(dev, GGC, &ggc))
6106 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6107 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6109 } else if (dmar_map_gfx) {
6110 /* we have to ensure the gfx device is idle before we flush */
6111 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6112 intel_iommu_strict = 1;
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6120 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6121 ISOCH DMAR unit for the Azalia sound device, but not give it any
6122 TLB entries, which causes it to deadlock. Check for that. We do
6123 this in a function called from init_dmars(), instead of in a PCI
6124 quirk, because we don't want to print the obnoxious "BIOS broken"
6125 message if VT-d is actually disabled.
6127 static void __init check_tylersburg_isoch(void)
6129 struct pci_dev *pdev;
6130 uint32_t vtisochctrl;
6132 /* If there's no Azalia in the system anyway, forget it. */
6133 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6137 if (risky_device(pdev)) {
6144 /* System Management Registers. Might be hidden, in which case
6145 we can't do the sanity check. But that's OK, because the
6146 known-broken BIOSes _don't_ actually hide it, so far. */
6147 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6151 if (risky_device(pdev)) {
6156 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6163 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6164 if (vtisochctrl & 1)
6167 /* Drop all bits other than the number of TLB entries */
6168 vtisochctrl &= 0x1c;
6170 /* If we have the recommended number of TLB entries (16), fine. */
6171 if (vtisochctrl == 0x10)
6174 /* Zero TLB entries? You get to ride the short bus to school. */
6176 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6177 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6178 dmi_get_system_info(DMI_BIOS_VENDOR),
6179 dmi_get_system_info(DMI_BIOS_VERSION),
6180 dmi_get_system_info(DMI_PRODUCT_VERSION));
6181 iommu_identity_mapping |= IDENTMAP_AZALIA;
6185 pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",