// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "../irq_remapping.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
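/*
 * For example, with a 48-bit guest address width and 4KiB VT-d pages,
 * __DOMAIN_MAX_PFN(48) is 2^36 - 1 and DOMAIN_MAX_ADDR(48) is
 * 0xfffffffff000, the base address of the highest addressable page.
 */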
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size was a power-of-two multiple of 4KiB and
 * that the mapping had natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * every power-of-two page size that is a multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
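/*
 * ~0xFFFUL leaves every bit from bit 12 upwards set, so this advertises
 * 4KiB, 8KiB, 16KiB, ... i.e. every power-of-two size of at least 4KiB,
 * matching the traditional behaviour described above.
 */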
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(u64 pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline u64 level_mask(int level)
133 return -1ULL << level_to_offset_bits(level);
136 static inline u64 level_size(int level)
138 return 1ULL << level_to_offset_bits(level);
141 static inline u64 align_to_level(u64 pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
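/*
 * Worked example: agaw_to_width(2) == min(30 + 2 * 9, 64) == 48 bits and
 * width_to_agaw(48) == DIV_ROUND_UP(48 - 30, 9) == 2.  At level 2,
 * level_to_offset_bits() is 9, so pfn_level_offset() picks bits 17:9 of
 * the pfn and level_size() covers 512 VT-d pages (2MiB).
 */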
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
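/*
 * With 4KiB MM pages, PAGE_SHIFT equals VTD_PAGE_SHIFT and the pfn
 * conversions above are identities; the shifts only matter when MM
 * pages are larger than VT-d's 4KiB pages.
 */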
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
/*
 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT)
 */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
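/*
 * The helpers below manipulate individual fields of a legacy context
 * entry: bit 0 of the low qword is Present, bits 3:2 hold the translation
 * type and the page-aligned remainder the address root; in the high
 * qword, bits 2:0 hold the address width and bits 23:8 the domain id.
 * Bit 11 of the low qword and bit 3 of the high qword track PASID enable
 * and the "copied from a previous kernel" state respectively.
 */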
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping of all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
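/* Iterate over the iommus that currently hold a reference on @domain. */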
299 #define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
303 struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
312 struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
323 #define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* number of IOMMUs in the system, used to size g_iommus and per-iommu state */
327 static int g_num_of_iommus;
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
361 #define IDENTMAP_GFX 2
362 #define IDENTMAP_AZALIA 4
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
370 struct device_domain_info *info;
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
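/* Untrusted PCI devices get bounce buffering unless "nobounce" was given. */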
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 void *data), void *data)
397 struct device_domain_info *info;
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
403 spin_unlock_irqrestore(&device_domain_lock, flags);
407 spin_unlock_irqrestore(&device_domain_lock, flags);
412 const struct iommu_ops intel_iommu_ops;
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
424 static void init_translation_status(struct intel_iommu *iommu)
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
433 static int __init intel_iommu_setup(char *str)
438 if (!strncmp(str, "on", 2)) {
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
468 str += strcspn(str, ",");
474 __setup("intel_iommu=", intel_iommu_setup);
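/*
 * Options are comma separated; for example, booting with
 * "intel_iommu=on,sm_on,strict" enables the IOMMU in scalable mode with
 * strict (unbatched) IOTLB flushing.
 */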
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
479 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
481 struct dmar_domain **domains;
484 domains = iommu->domains[idx];
488 return domains[did & 0xff];
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 struct dmar_domain *domain)
494 struct dmar_domain **domains;
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
506 domains[did & 0xff] = domain;
509 void *alloc_pgtable_page(int node)
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
516 vaddr = page_address(page);
520 void free_pgtable_page(void *vaddr)
522 free_page((unsigned long)vaddr);
525 static inline void *alloc_domain_mem(void)
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 static void free_domain_mem(void *vaddr)
532 kmem_cache_free(iommu_domain_cache, vaddr);
535 static inline void * alloc_devinfo_mem(void)
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 static inline void free_devinfo_mem(void *vaddr)
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
545 static inline int domain_type_is_si(struct dmar_domain *domain)
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
564 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
565 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
566 * the returned SAGAW.
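 * For example, hardware that implements 4- and 5-level second-level page
 * tables reports SAGAW bits 2 and 3 (0xc); first-level translation
 * contributes BIT(2) plus, when 5-level paging is supported, BIT(3).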
568 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
570 unsigned long fl_sagaw, sl_sagaw;
572 fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
573 sl_sagaw = cap_sagaw(iommu->cap);
575 /* Second level only. */
576 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
579 /* First level only. */
580 if (!ecap_slts(iommu->ecap))
583 return fl_sagaw & sl_sagaw;
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
591 sagaw = __iommu_calculate_sagaw(iommu);
592 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
593 if (test_bit(agaw, &sagaw))
601 * Calculate max SAGAW for each iommu.
603 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may differ across iommus, so use a default agaw and fall back
 * to a smaller supported agaw for iommus that don't support the default.
 */
613 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu in a domain */
619 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
623 /* si_domain and vm domain should not get here. */
624 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
627 for_each_domain_iommu(iommu_id, domain)
630 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
633 return g_iommus[iommu_id];
636 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
638 return sm_supported(iommu) ?
639 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
642 static void domain_update_iommu_coherency(struct dmar_domain *domain)
644 struct dmar_drhd_unit *drhd;
645 struct intel_iommu *iommu;
649 domain->iommu_coherency = 1;
651 for_each_domain_iommu(i, domain) {
653 if (!iommu_paging_structure_coherency(g_iommus[i])) {
654 domain->iommu_coherency = 0;
661 /* No hardware attached; use lowest common denominator */
663 for_each_active_iommu(iommu, drhd) {
664 if (!iommu_paging_structure_coherency(iommu)) {
665 domain->iommu_coherency = 0;
672 static int domain_update_iommu_snooping(struct intel_iommu *skip)
674 struct dmar_drhd_unit *drhd;
675 struct intel_iommu *iommu;
679 for_each_active_iommu(iommu, drhd) {
		/*
		 * If the hardware is operating in scalable mode, snooping
		 * control is always supported since we always set the
		 * PASID-table-entry.PGSNP bit if the domain is managed
		 * outside (UNMANAGED).
		 */
687 if (!sm_supported(iommu) &&
688 !ecap_sc_support(iommu->ecap)) {
699 static int domain_update_iommu_superpage(struct dmar_domain *domain,
700 struct intel_iommu *skip)
702 struct dmar_drhd_unit *drhd;
703 struct intel_iommu *iommu;
706 if (!intel_iommu_superpage) {
710 /* set iommu_superpage to the smallest common denominator */
712 for_each_active_iommu(iommu, drhd) {
714 if (domain && domain_use_first_level(domain)) {
715 if (!cap_fl1gp_support(iommu->cap))
718 mask &= cap_super_page_val(iommu->cap);
730 static int domain_update_device_node(struct dmar_domain *domain)
732 struct device_domain_info *info;
733 int nid = NUMA_NO_NODE;
735 assert_spin_locked(&device_domain_lock);
737 if (list_empty(&domain->devices))
740 list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices within
		 * the same domain may sit behind different IOMMUs. There is no
		 * perfect answer in such a situation, so we use a first come,
		 * first served policy.
		 */
750 nid = dev_to_node(info->dev);
751 if (nid != NUMA_NO_NODE)
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
761 domain_update_iommu_coherency(domain);
762 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
	/*
	 * If RHSA is missing, we should default to the device numa domain
	 * as fall back.
	 */
769 if (domain->nid == NUMA_NO_NODE)
770 domain->nid = domain_update_device_node(domain);
	/*
	 * First-level translation restricts the input-address to a
	 * canonical address (i.e., address bits 63:N have the same
	 * value as address bit [N-1], where N is 48-bits with 4-level
	 * paging and 57-bits with 5-level paging). Hence, skip bit
	 * [N-1].
	 */
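	/*
	 * For example, a first-level domain with gaw == 48 ends up with an
	 * aperture end of __DOMAIN_MAX_ADDR(47), i.e. 2^47 - 1.
	 */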
779 if (domain_use_first_level(domain))
780 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
782 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
785 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
788 struct root_entry *root = &iommu->root_entry[bus];
789 struct context_entry *context;
793 if (sm_supported(iommu)) {
801 context = phys_to_virt(*entry & VTD_PAGE_MASK);
803 unsigned long phy_addr;
807 context = alloc_pgtable_page(iommu->node);
811 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
812 phy_addr = virt_to_phys((void *)context);
813 *entry = phy_addr | 1;
814 __iommu_flush_cache(iommu, entry, sizeof(*entry));
816 return &context[devfn];
819 static bool attach_deferred(struct device *dev)
821 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
825 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
826 * sub-hierarchy of a candidate PCI-PCI bridge
827 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
828 * @bridge: the candidate PCI-PCI bridge
830 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
833 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
835 struct pci_dev *pdev, *pbridge;
837 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
840 pdev = to_pci_dev(dev);
841 pbridge = to_pci_dev(bridge);
843 if (pbridge->subordinate &&
844 pbridge->subordinate->number <= pdev->bus->number &&
845 pbridge->subordinate->busn_res.end >= pdev->bus->number)
851 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
853 struct dmar_drhd_unit *drhd;
857 /* We know that this device on this chipset has its own IOMMU.
858 * If we find it under a different IOMMU, then the BIOS is lying
859 * to us. Hope that the IOMMU for this device is actually
860 * disabled, and it needs no translation...
862 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
865 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
	/* we know that this iommu should be at offset 0xa000 from vtbar */
871 drhd = dmar_find_matched_drhd_unit(pdev);
872 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
873 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
874 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
881 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
883 if (!iommu || iommu->drhd->ignored)
886 if (dev_is_pci(dev)) {
887 struct pci_dev *pdev = to_pci_dev(dev);
889 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
890 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
891 quirk_ioat_snb_local_iommu(pdev))
898 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
900 struct dmar_drhd_unit *drhd = NULL;
901 struct pci_dev *pdev = NULL;
902 struct intel_iommu *iommu;
910 if (dev_is_pci(dev)) {
911 struct pci_dev *pf_pdev;
913 pdev = pci_real_dma_dev(to_pci_dev(dev));
915 /* VFs aren't listed in scope tables; we need to look up
916 * the PF instead to find the IOMMU. */
917 pf_pdev = pci_physfn(pdev);
919 segment = pci_domain_nr(pdev->bus);
920 } else if (has_acpi_companion(dev))
921 dev = &ACPI_COMPANION(dev)->dev;
924 for_each_iommu(iommu, drhd) {
925 if (pdev && segment != drhd->segment)
928 for_each_active_dev_scope(drhd->devices,
929 drhd->devices_cnt, i, tmp) {
931 /* For a VF use its original BDF# not that of the PF
932 * which we used for the IOMMU lookup. Strictly speaking
933 * we could do this for all PCI devices; we only need to
934 * get the BDF# from the scope table for ACPI matches. */
935 if (pdev && pdev->is_virtfn)
939 *bus = drhd->devices[i].bus;
940 *devfn = drhd->devices[i].devfn;
945 if (is_downstream_to_pci_bridge(dev, tmp))
949 if (pdev && drhd->include_all) {
952 *bus = pdev->bus->number;
953 *devfn = pdev->devfn;
960 if (iommu_is_dummy(iommu, dev))
968 static void domain_flush_cache(struct dmar_domain *domain,
969 void *addr, int size)
971 if (!domain->iommu_coherency)
972 clflush_cache_range(addr, size);
975 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
977 struct context_entry *context;
981 spin_lock_irqsave(&iommu->lock, flags);
982 context = iommu_context_addr(iommu, bus, devfn, 0);
984 ret = context_present(context);
985 spin_unlock_irqrestore(&iommu->lock, flags);
989 static void free_context_table(struct intel_iommu *iommu)
993 struct context_entry *context;
995 spin_lock_irqsave(&iommu->lock, flags);
996 if (!iommu->root_entry) {
999 for (i = 0; i < ROOT_ENTRY_NR; i++) {
1000 context = iommu_context_addr(iommu, i, 0, 0);
1002 free_pgtable_page(context);
1004 if (!sm_supported(iommu))
1007 context = iommu_context_addr(iommu, i, 0x80, 0);
1009 free_pgtable_page(context);
1012 free_pgtable_page(iommu->root_entry);
1013 iommu->root_entry = NULL;
1015 spin_unlock_irqrestore(&iommu->lock, flags);
1018 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1019 unsigned long pfn, int *target_level)
1021 struct dma_pte *parent, *pte;
1022 int level = agaw_to_level(domain->agaw);
1025 BUG_ON(!domain->pgd);
1027 if (!domain_pfn_supported(domain, pfn))
1028 /* Address beyond IOMMU's addressing capabilities. */
1031 parent = domain->pgd;
1036 offset = pfn_level_offset(pfn, level);
1037 pte = &parent[offset];
1038 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1040 if (level == *target_level)
1043 if (!dma_pte_present(pte)) {
1046 tmp_page = alloc_pgtable_page(domain->nid);
1051 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1052 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1053 if (domain_use_first_level(domain)) {
1054 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1055 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1056 pteval |= DMA_FL_PTE_ACCESS;
1058 if (cmpxchg64(&pte->val, 0ULL, pteval))
1059 /* Someone else set it while we were thinking; use theirs. */
1060 free_pgtable_page(tmp_page);
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1067 parent = phys_to_virt(dma_pte_addr(pte));
1072 *target_level = level;
1077 /* return address's pte at specific level */
1078 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1080 int level, int *large_page)
1082 struct dma_pte *parent, *pte;
1083 int total = agaw_to_level(domain->agaw);
1086 parent = domain->pgd;
1087 while (level <= total) {
1088 offset = pfn_level_offset(pfn, total);
1089 pte = &parent[offset];
1093 if (!dma_pte_present(pte)) {
1094 *large_page = total;
1098 if (dma_pte_superpage(pte)) {
1099 *large_page = total;
1103 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; a tlb flush should follow */
1110 static void dma_pte_clear_range(struct dmar_domain *domain,
1111 unsigned long start_pfn,
1112 unsigned long last_pfn)
1114 unsigned int large_page;
1115 struct dma_pte *first_pte, *pte;
1117 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1118 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1119 BUG_ON(start_pfn > last_pfn);
1121 /* we don't need lock here; nobody else touches the iova range */
1124 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1126 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1131 start_pfn += lvl_to_nr_pages(large_page);
1133 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1135 domain_flush_cache(domain, first_pte,
1136 (void *)pte - (void *)first_pte);
1138 } while (start_pfn && start_pfn <= last_pfn);
1141 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1142 int retain_level, struct dma_pte *pte,
1143 unsigned long pfn, unsigned long start_pfn,
1144 unsigned long last_pfn)
1146 pfn = max(start_pfn, pfn);
1147 pte = &pte[pfn_level_offset(pfn, level)];
1150 unsigned long level_pfn;
1151 struct dma_pte *level_pte;
1153 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1156 level_pfn = pfn & level_mask(level);
1157 level_pte = phys_to_virt(dma_pte_addr(pte));
1160 dma_pte_free_level(domain, level - 1, retain_level,
1161 level_pte, level_pfn, start_pfn,
1166 * Free the page table if we're below the level we want to
1167 * retain and the range covers the entire table.
1169 if (level < retain_level && !(start_pfn > level_pfn ||
1170 last_pfn < level_pfn + level_size(level) - 1)) {
1172 domain_flush_cache(domain, pte, sizeof(*pte));
1173 free_pgtable_page(level_pte);
1176 pfn += level_size(level);
1177 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1181 * clear last level (leaf) ptes and free page table pages below the
1182 * level we wish to keep intact.
1184 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1185 unsigned long start_pfn,
1186 unsigned long last_pfn,
1189 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191 BUG_ON(start_pfn > last_pfn);
1193 dma_pte_clear_range(domain, start_pfn, last_pfn);
1195 /* We don't need lock here; nobody else touches the iova range */
1196 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1197 domain->pgd, 0, start_pfn, last_pfn);
1200 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1201 free_pgtable_page(domain->pgd);
1206 /* When a page at a given level is being unlinked from its parent, we don't
1207 need to *modify* it at all. All we need to do is make a list of all the
1208 pages which can be freed just as soon as we've flushed the IOTLB and we
1209 know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1212 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1213 int level, struct dma_pte *pte,
1214 struct page *freelist)
1218 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1219 pg->freelist = freelist;
1225 pte = page_address(pg);
1227 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1228 freelist = dma_pte_list_pagetables(domain, level - 1,
1231 } while (!first_pte_in_page(pte));
1236 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1237 struct dma_pte *pte, unsigned long pfn,
1238 unsigned long start_pfn,
1239 unsigned long last_pfn,
1240 struct page *freelist)
1242 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1244 pfn = max(start_pfn, pfn);
1245 pte = &pte[pfn_level_offset(pfn, level)];
1248 unsigned long level_pfn;
1250 if (!dma_pte_present(pte))
1253 level_pfn = pfn & level_mask(level);
1255 /* If range covers entire pagetable, free it */
1256 if (start_pfn <= level_pfn &&
1257 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1260 if (level > 1 && !dma_pte_superpage(pte))
1261 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1267 } else if (level > 1) {
1268 /* Recurse down into a level that isn't *entirely* obsolete */
1269 freelist = dma_pte_clear_level(domain, level - 1,
1270 phys_to_virt(dma_pte_addr(pte)),
1271 level_pfn, start_pfn, last_pfn,
1275 pfn += level_size(level);
1276 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1279 domain_flush_cache(domain, first_pte,
1280 (void *)++last_pte - (void *)first_pte);
1285 /* We can't just free the pages because the IOMMU may still be walking
1286 the page tables, and may have cached the intermediate levels. The
1287 pages can only be freed after the IOTLB flush has been done. */
1288 static struct page *domain_unmap(struct dmar_domain *domain,
1289 unsigned long start_pfn,
1290 unsigned long last_pfn)
1292 struct page *freelist;
1294 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1295 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1296 BUG_ON(start_pfn > last_pfn);
1298 /* we don't need lock here; nobody else touches the iova range */
1299 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1300 domain->pgd, 0, start_pfn, last_pfn, NULL);
1303 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1304 struct page *pgd_page = virt_to_page(domain->pgd);
1305 pgd_page->freelist = freelist;
1306 freelist = pgd_page;
1314 static void dma_free_pagelist(struct page *freelist)
1318 while ((pg = freelist)) {
1319 freelist = pg->freelist;
1320 free_pgtable_page(page_address(pg));
1324 static void iova_entry_free(unsigned long data)
1326 struct page *freelist = (struct page *)data;
1328 dma_free_pagelist(freelist);
1331 /* iommu handling */
1332 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1334 struct root_entry *root;
1335 unsigned long flags;
1337 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1339 pr_err("Allocating root entry for %s failed\n",
1344 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1346 spin_lock_irqsave(&iommu->lock, flags);
1347 iommu->root_entry = root;
1348 spin_unlock_irqrestore(&iommu->lock, flags);
1353 static void iommu_set_root_entry(struct intel_iommu *iommu)
1359 addr = virt_to_phys(iommu->root_entry);
1360 if (sm_supported(iommu))
1361 addr |= DMA_RTADDR_SMT;
1363 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1366 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1369 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1370 readl, (sts & DMA_GSTS_RTPS), sts);
1372 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1374 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1375 if (sm_supported(iommu))
1376 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1377 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1380 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1385 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1388 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1389 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
	/* Make sure hardware completes it */
1392 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1393 readl, (!(val & DMA_GSTS_WBFS)), val);
1395 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines whether we need a write buffer flush */
1399 static void __iommu_flush_context(struct intel_iommu *iommu,
1400 u16 did, u16 source_id, u8 function_mask,
1407 case DMA_CCMD_GLOBAL_INVL:
1408 val = DMA_CCMD_GLOBAL_INVL;
1410 case DMA_CCMD_DOMAIN_INVL:
1411 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1413 case DMA_CCMD_DEVICE_INVL:
1414 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1415 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1420 val |= DMA_CCMD_ICC;
1422 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
	/* Make sure hardware completes it */
1426 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1427 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1429 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines whether we need a write buffer flush */
1433 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1434 u64 addr, unsigned int size_order, u64 type)
1436 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1437 u64 val = 0, val_iva = 0;
1441 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1443 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1445 case DMA_TLB_DSI_FLUSH:
1446 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1448 case DMA_TLB_PSI_FLUSH:
1449 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1450 /* IH bit is passed in as part of address */
1451 val_iva = size_order | addr;
1456 /* Note: set drain read/write */
	/*
	 * This is probably just to be extra safe. It looks like we can
	 * ignore it without any impact.
	 */
1462 if (cap_read_drain(iommu->cap))
1463 val |= DMA_TLB_READ_DRAIN;
1465 if (cap_write_drain(iommu->cap))
1466 val |= DMA_TLB_WRITE_DRAIN;
1468 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1469 /* Note: Only uses first TLB reg currently */
1471 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1472 dmar_writeq(iommu->reg + tlb_offset + 8, val);
	/* Make sure hardware completes it */
1475 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1476 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1478 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1480 /* check IOTLB invalidation granularity */
1481 if (DMA_TLB_IAIG(val) == 0)
1482 pr_err("Flush IOTLB failed\n");
1483 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1484 pr_debug("TLB flush request %Lx, actual %Lx\n",
1485 (unsigned long long)DMA_TLB_IIRG(type),
1486 (unsigned long long)DMA_TLB_IAIG(val));
1489 static struct device_domain_info *
1490 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1493 struct device_domain_info *info;
1495 assert_spin_locked(&device_domain_lock);
1500 list_for_each_entry(info, &domain->devices, link)
1501 if (info->iommu == iommu && info->bus == bus &&
1502 info->devfn == devfn) {
1503 if (info->ats_supported && info->dev)
1511 static void domain_update_iotlb(struct dmar_domain *domain)
1513 struct device_domain_info *info;
1514 bool has_iotlb_device = false;
1516 assert_spin_locked(&device_domain_lock);
1518 list_for_each_entry(info, &domain->devices, link) {
1519 struct pci_dev *pdev;
1521 if (!info->dev || !dev_is_pci(info->dev))
1524 pdev = to_pci_dev(info->dev);
1525 if (pdev->ats_enabled) {
1526 has_iotlb_device = true;
1531 domain->has_iotlb_device = has_iotlb_device;
1534 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1536 struct pci_dev *pdev;
1538 assert_spin_locked(&device_domain_lock);
1540 if (!info || !dev_is_pci(info->dev))
1543 pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
1549 if (!ecap_dit(info->iommu->ecap))
1552 struct pci_dev *pf_pdev;
		/* pdev will be returned if the device is not a VF */
1555 pf_pdev = pci_physfn(pdev);
1556 info->pfsid = pci_dev_id(pf_pdev);
1559 #ifdef CONFIG_INTEL_IOMMU_SVM
1560 /* The PCIe spec, in its wisdom, declares that the behaviour of
1561 the device if you enable PASID support after ATS support is
1562 undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
1565 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1566 info->pasid_enabled = 1;
1568 if (info->pri_supported &&
1569 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1570 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1571 info->pri_enabled = 1;
1573 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1574 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1575 info->ats_enabled = 1;
1576 domain_update_iotlb(info->domain);
1577 info->ats_qdep = pci_ats_queue_depth(pdev);
1581 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1583 struct pci_dev *pdev;
1585 assert_spin_locked(&device_domain_lock);
1587 if (!dev_is_pci(info->dev))
1590 pdev = to_pci_dev(info->dev);
1592 if (info->ats_enabled) {
1593 pci_disable_ats(pdev);
1594 info->ats_enabled = 0;
1595 domain_update_iotlb(info->domain);
1597 #ifdef CONFIG_INTEL_IOMMU_SVM
1598 if (info->pri_enabled) {
1599 pci_disable_pri(pdev);
1600 info->pri_enabled = 0;
1602 if (info->pasid_enabled) {
1603 pci_disable_pasid(pdev);
1604 info->pasid_enabled = 0;
1609 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1610 u64 addr, unsigned mask)
1613 unsigned long flags;
1614 struct device_domain_info *info;
1616 if (!domain->has_iotlb_device)
1619 spin_lock_irqsave(&device_domain_lock, flags);
1620 list_for_each_entry(info, &domain->devices, link) {
1621 if (!info->ats_enabled)
1624 sid = info->bus << 8 | info->devfn;
1625 qdep = info->ats_qdep;
1626 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1629 spin_unlock_irqrestore(&device_domain_lock, flags);
1632 static void domain_flush_piotlb(struct intel_iommu *iommu,
1633 struct dmar_domain *domain,
1634 u64 addr, unsigned long npages, bool ih)
1636 u16 did = domain->iommu_did[iommu->seq_id];
1638 if (domain->default_pasid)
1639 qi_flush_piotlb(iommu, did, domain->default_pasid,
1642 if (!list_empty(&domain->devices))
1643 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1646 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1647 struct dmar_domain *domain,
1648 unsigned long pfn, unsigned int pages,
1651 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1652 unsigned int mask = ilog2(aligned_pages);
1653 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1654 u16 did = domain->iommu_did[iommu->seq_id];
1661 if (domain_use_first_level(domain)) {
1662 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1664 unsigned long bitmask = aligned_pages - 1;
1667 * PSI masks the low order bits of the base address. If the
1668 * address isn't aligned to the mask, then compute a mask value
1669 * needed to ensure the target range is flushed.
1671 if (unlikely(bitmask & pfn)) {
1672 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1675 * Since end_pfn <= pfn + bitmask, the only way bits
1676 * higher than bitmask can differ in pfn and end_pfn is
1677 * by carrying. This means after masking out bitmask,
1678 * high bits starting with the first set bit in
1679 * shared_bits are all equal in both pfn and end_pfn.
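	 *
	 * For example, with pfn == 0x11 and pages == 2: bitmask == 1,
	 * end_pfn == 0x12 and pfn ^ end_pfn == 0x3, so the lowest shared
	 * bit is bit 2 and mask becomes 2.  The resulting PSI flushes four
	 * pages starting at pfn 0x10, which covers 0x11-0x12.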
1681 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1682 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
	 * Fall back to domain selective flush if no PSI support or
	 * the size is too big.
1689 if (!cap_pgsel_inv(iommu->cap) ||
1690 mask > cap_max_amask_val(iommu->cap))
1691 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1694 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
	/*
	 * In caching mode, changes of pages from non-present to present
	 * require a flush. However, the device IOTLB doesn't need to be
	 * flushed in this case.
	 */
1702 if (!cap_caching_mode(iommu->cap) || !map)
1703 iommu_flush_dev_iotlb(domain, addr, mask);
1706 /* Notification for newly created mappings */
1707 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1708 struct dmar_domain *domain,
1709 unsigned long pfn, unsigned int pages)
	/*
	 * It's a non-present to present mapping. Only flush if caching mode
	 * and second level.
	 */
1715 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1716 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1718 iommu_flush_write_buffer(iommu);
1721 static void iommu_flush_iova(struct iova_domain *iovad)
1723 struct dmar_domain *domain;
1726 domain = container_of(iovad, struct dmar_domain, iovad);
1728 for_each_domain_iommu(idx, domain) {
1729 struct intel_iommu *iommu = g_iommus[idx];
1730 u16 did = domain->iommu_did[iommu->seq_id];
1732 if (domain_use_first_level(domain))
1733 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1735 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1738 if (!cap_caching_mode(iommu->cap))
1739 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1740 0, MAX_AGAW_PFN_WIDTH);
1744 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1747 unsigned long flags;
1749 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1752 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1753 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1754 pmen &= ~DMA_PMEN_EPM;
1755 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1757 /* wait for the protected region status bit to clear */
1758 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1759 readl, !(pmen & DMA_PMEN_PRS), pmen);
1761 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1764 static void iommu_enable_translation(struct intel_iommu *iommu)
1767 unsigned long flags;
1769 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1770 iommu->gcmd |= DMA_GCMD_TE;
1771 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1773 /* Make sure hardware complete it */
1774 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1775 readl, (sts & DMA_GSTS_TES), sts);
1777 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1780 static void iommu_disable_translation(struct intel_iommu *iommu)
1785 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1786 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1789 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1790 iommu->gcmd &= ~DMA_GCMD_TE;
1791 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1793 /* Make sure hardware complete it */
1794 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1795 readl, (!(sts & DMA_GSTS_TES)), sts);
1797 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1800 static int iommu_init_domains(struct intel_iommu *iommu)
1802 u32 ndomains, nlongs;
1805 ndomains = cap_ndoms(iommu->cap);
1806 pr_debug("%s: Number of Domains supported <%d>\n",
1807 iommu->name, ndomains);
1808 nlongs = BITS_TO_LONGS(ndomains);
1810 spin_lock_init(&iommu->lock);
1812 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1813 if (!iommu->domain_ids) {
1814 pr_err("%s: Allocating domain id array failed\n",
1819 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1820 iommu->domains = kzalloc(size, GFP_KERNEL);
1822 if (iommu->domains) {
1823 size = 256 * sizeof(struct dmar_domain *);
1824 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1827 if (!iommu->domains || !iommu->domains[0]) {
1828 pr_err("%s: Allocating domain array failed\n",
1830 kfree(iommu->domain_ids);
1831 kfree(iommu->domains);
1832 iommu->domain_ids = NULL;
1833 iommu->domains = NULL;
1838 * If Caching mode is set, then invalid translations are tagged
1839 * with domain-id 0, hence we need to pre-allocate it. We also
1840 * use domain-id 0 as a marker for non-allocated domain-id, so
1841 * make sure it is not used for a real domain.
1843 set_bit(0, iommu->domain_ids);
	/*
	 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
1852 if (sm_supported(iommu))
1853 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1858 static void disable_dmar_iommu(struct intel_iommu *iommu)
1860 struct device_domain_info *info, *tmp;
1861 unsigned long flags;
1863 if (!iommu->domains || !iommu->domain_ids)
1866 spin_lock_irqsave(&device_domain_lock, flags);
1867 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1868 if (info->iommu != iommu)
1871 if (!info->dev || !info->domain)
1874 __dmar_remove_one_dev_info(info);
1876 spin_unlock_irqrestore(&device_domain_lock, flags);
1878 if (iommu->gcmd & DMA_GCMD_TE)
1879 iommu_disable_translation(iommu);
1882 static void free_dmar_iommu(struct intel_iommu *iommu)
1884 if ((iommu->domains) && (iommu->domain_ids)) {
1885 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1888 for (i = 0; i < elems; i++)
1889 kfree(iommu->domains[i]);
1890 kfree(iommu->domains);
1891 kfree(iommu->domain_ids);
1892 iommu->domains = NULL;
1893 iommu->domain_ids = NULL;
1896 g_iommus[iommu->seq_id] = NULL;
1898 /* free context mapping */
1899 free_context_table(iommu);
1901 #ifdef CONFIG_INTEL_IOMMU_SVM
1902 if (pasid_supported(iommu)) {
1903 if (ecap_prs(iommu->ecap))
1904 intel_svm_finish_prq(iommu);
1906 if (vccap_pasid(iommu->vccap))
1907 ioasid_unregister_allocator(&iommu->pasid_allocator);
/*
 * Check and return whether first level is used by default for
 * DMA translation.
 */
1916 static bool first_level_by_default(void)
1918 struct dmar_drhd_unit *drhd;
1919 struct intel_iommu *iommu;
1920 static int first_level_support = -1;
1922 if (likely(first_level_support != -1))
1923 return first_level_support;
1925 first_level_support = 1;
1928 for_each_active_iommu(iommu, drhd) {
1929 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1930 first_level_support = 0;
1936 return first_level_support;
1939 static struct dmar_domain *alloc_domain(int flags)
1941 struct dmar_domain *domain;
1943 domain = alloc_domain_mem();
1947 memset(domain, 0, sizeof(*domain));
1948 domain->nid = NUMA_NO_NODE;
1949 domain->flags = flags;
1950 if (first_level_by_default())
1951 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1952 domain->has_iotlb_device = false;
1953 INIT_LIST_HEAD(&domain->devices);
1958 /* Must be called with iommu->lock */
1959 static int domain_attach_iommu(struct dmar_domain *domain,
1960 struct intel_iommu *iommu)
1962 unsigned long ndomains;
1965 assert_spin_locked(&device_domain_lock);
1966 assert_spin_locked(&iommu->lock);
1968 domain->iommu_refcnt[iommu->seq_id] += 1;
1969 domain->iommu_count += 1;
1970 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1971 ndomains = cap_ndoms(iommu->cap);
1972 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1974 if (num >= ndomains) {
1975 pr_err("%s: No free domain ids\n", iommu->name);
1976 domain->iommu_refcnt[iommu->seq_id] -= 1;
1977 domain->iommu_count -= 1;
1981 set_bit(num, iommu->domain_ids);
1982 set_iommu_domain(iommu, num, domain);
1984 domain->iommu_did[iommu->seq_id] = num;
1985 domain->nid = iommu->node;
1987 domain_update_iommu_cap(domain);
1993 static int domain_detach_iommu(struct dmar_domain *domain,
1994 struct intel_iommu *iommu)
1998 assert_spin_locked(&device_domain_lock);
1999 assert_spin_locked(&iommu->lock);
2001 domain->iommu_refcnt[iommu->seq_id] -= 1;
2002 count = --domain->iommu_count;
2003 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2004 num = domain->iommu_did[iommu->seq_id];
2005 clear_bit(num, iommu->domain_ids);
2006 set_iommu_domain(iommu, num, NULL);
2008 domain_update_iommu_cap(domain);
2009 domain->iommu_did[iommu->seq_id] = 0;
2015 static struct iova_domain reserved_iova_list;
2016 static struct lock_class_key reserved_rbtree_key;
2018 static int dmar_init_reserved_ranges(void)
2020 struct pci_dev *pdev = NULL;
2024 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
2026 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
2027 &reserved_rbtree_key);
2029 /* IOAPIC ranges shouldn't be accessed by DMA */
2030 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
2031 IOVA_PFN(IOAPIC_RANGE_END));
2033 pr_err("Reserve IOAPIC range failed\n");
2037 /* Reserve all PCI MMIO to avoid peer-to-peer access */
2038 for_each_pci_dev(pdev) {
2041 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
2042 r = &pdev->resource[i];
2043 if (!r->flags || !(r->flags & IORESOURCE_MEM))
2045 iova = reserve_iova(&reserved_iova_list,
2049 pci_err(pdev, "Reserve iova for %pR failed\n", r);
2057 static inline int guestwidth_to_adjustwidth(int gaw)
2060 int r = (gaw - 12) % 9;
2071 static void domain_exit(struct dmar_domain *domain)
2074 /* Remove associated devices and clear attached or cached domains */
2075 domain_remove_dev_info(domain);
2078 if (domain->domain.type == IOMMU_DOMAIN_DMA)
2079 put_iova_domain(&domain->iovad);
2082 struct page *freelist;
2084 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2085 dma_free_pagelist(freelist);
2088 free_domain_mem(domain);
/*
 * Get the PASID directory size for a scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates a PASID directory with 2^(X + 7) entries.
 */
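/* For example, X == 7 encodes a PASID directory with 2^(7 + 7) == 16384 entries. */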
2096 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2100 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2101 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2109 * Set the RID_PASID field of a scalable mode context entry. The
2110 * IOMMU hardware will use the PASID value set in this field for
2111 * DMA translations of DMA requests without PASID.
2114 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2116 context->hi |= pasid & ((1 << 20) - 1);
/*
 * Set the DTE (Device-TLB Enable) field of a scalable mode context
 * entry.
 */
2123 static inline void context_set_sm_dte(struct context_entry *context)
2125 context->lo |= (1 << 2);
/*
 * Set the PRE (Page Request Enable) field of a scalable mode context
 * entry.
 */
2132 static inline void context_set_sm_pre(struct context_entry *context)
2134 context->lo |= (1 << 4);
2137 /* Convert value to context PASID directory size field coding. */
2138 #define context_pdts(pds) (((pds) & 0x7) << 9)
2140 static int domain_context_mapping_one(struct dmar_domain *domain,
2141 struct intel_iommu *iommu,
2142 struct pasid_table *table,
2145 u16 did = domain->iommu_did[iommu->seq_id];
2146 int translation = CONTEXT_TT_MULTI_LEVEL;
2147 struct device_domain_info *info = NULL;
2148 struct context_entry *context;
2149 unsigned long flags;
2154 if (hw_pass_through && domain_type_is_si(domain))
2155 translation = CONTEXT_TT_PASS_THROUGH;
2157 pr_debug("Set context mapping for %02x:%02x.%d\n",
2158 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2160 BUG_ON(!domain->pgd);
2162 spin_lock_irqsave(&device_domain_lock, flags);
2163 spin_lock(&iommu->lock);
2166 context = iommu_context_addr(iommu, bus, devfn, 1);
2171 if (context_present(context))
2175 * For kdump cases, old valid entries may be cached due to the
2176 * in-flight DMA and copied pgtable, but there is no unmapping
2177 * behaviour for them, thus we need an explicit cache flush for
2178 * the newly-mapped device. For kdump, at this point, the device
2179 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
2183 if (context_copied(context)) {
2184 u16 did_old = context_domain_id(context);
2186 if (did_old < cap_ndoms(iommu->cap)) {
2187 iommu->flush.flush_context(iommu, did_old,
2188 (((u16)bus) << 8) | devfn,
2189 DMA_CCMD_MASK_NOBIT,
2190 DMA_CCMD_DEVICE_INVL);
2191 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2196 context_clear_entry(context);
2198 if (sm_supported(iommu)) {
2203 /* Setup the PASID DIR pointer: */
2204 pds = context_get_sm_pds(table);
2205 context->lo = (u64)virt_to_phys(table->table) |
2208 /* Setup the RID_PASID field: */
2209 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2212 * Setup the Device-TLB enable bit and Page request
2215 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2216 if (info && info->ats_supported)
2217 context_set_sm_dte(context);
2218 if (info && info->pri_supported)
2219 context_set_sm_pre(context);
2221 struct dma_pte *pgd = domain->pgd;
2224 context_set_domain_id(context, did);
2226 if (translation != CONTEXT_TT_PASS_THROUGH) {
2228 * Skip top levels of page tables for iommu which has
2229 * less agaw than default. Unnecessary for PT mode.
2231 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2233 pgd = phys_to_virt(dma_pte_addr(pgd));
2234 if (!dma_pte_present(pgd))
2238 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2239 if (info && info->ats_supported)
2240 translation = CONTEXT_TT_DEV_IOTLB;
2242 translation = CONTEXT_TT_MULTI_LEVEL;
2244 context_set_address_root(context, virt_to_phys(pgd));
2245 context_set_address_width(context, agaw);
2248 * In pass through mode, AW must be programmed to
2249 * indicate the largest AGAW value supported by
2250 * hardware. And ASR is ignored by hardware.
2252 context_set_address_width(context, iommu->msagaw);
2255 context_set_translation_type(context, translation);
2258 context_set_fault_enable(context);
2259 context_set_present(context);
2260 if (!ecap_coherent(iommu->ecap))
2261 clflush_cache_range(context, sizeof(*context));
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries, then we only need to flush the write-buffer.
	 * If it _does_ cache non-present entries, then it does so in the
	 * special domain #0, which we have to flush:
	 */
2269 if (cap_caching_mode(iommu->cap)) {
2270 iommu->flush.flush_context(iommu, 0,
2271 (((u16)bus) << 8) | devfn,
2272 DMA_CCMD_MASK_NOBIT,
2273 DMA_CCMD_DEVICE_INVL);
2274 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2276 iommu_flush_write_buffer(iommu);
2278 iommu_enable_dev_iotlb(info);
2283 spin_unlock(&iommu->lock);
2284 spin_unlock_irqrestore(&device_domain_lock, flags);
2289 struct domain_context_mapping_data {
2290 struct dmar_domain *domain;
2291 struct intel_iommu *iommu;
2292 struct pasid_table *table;
2295 static int domain_context_mapping_cb(struct pci_dev *pdev,
2296 u16 alias, void *opaque)
2298 struct domain_context_mapping_data *data = opaque;
2300 return domain_context_mapping_one(data->domain, data->iommu,
2301 data->table, PCI_BUS_NUM(alias),
2306 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2308 struct domain_context_mapping_data data;
2309 struct pasid_table *table;
2310 struct intel_iommu *iommu;
2313 iommu = device_to_iommu(dev, &bus, &devfn);
2317 table = intel_pasid_get_table(dev);
2319 if (!dev_is_pci(dev))
2320 return domain_context_mapping_one(domain, iommu, table,
2323 data.domain = domain;
2327 return pci_for_each_dma_alias(to_pci_dev(dev),
2328 &domain_context_mapping_cb, &data);
2331 static int domain_context_mapped_cb(struct pci_dev *pdev,
2332 u16 alias, void *opaque)
2334 struct intel_iommu *iommu = opaque;
2336 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2339 static int domain_context_mapped(struct device *dev)
2341 struct intel_iommu *iommu;
2344 iommu = device_to_iommu(dev, &bus, &devfn);
2348 if (!dev_is_pci(dev))
2349 return device_context_mapped(iommu, bus, devfn);
2351 return !pci_for_each_dma_alias(to_pci_dev(dev),
2352 domain_context_mapped_cb, iommu);
2355 /* Returns a number of VTD pages, but aligned to MM page size */
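/* e.g. aligned_nrpages(0x1001, 0x2000) == 3: an 8KiB buffer that starts
 * one byte into a page spans three 4KiB VT-d pages.
 */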
2356 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2359 host_addr &= ~PAGE_MASK;
2360 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
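/*
 * Example: if both iov_pfn and phy_pfn have their low nine bits clear
 * (i.e. are 2MiB aligned) and at least 512 4KiB pages are being mapped,
 * the loop advances to level 2, which corresponds to a 2MiB superpage,
 * provided domain->iommu_superpage reports support for it.
 */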
2391 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392 struct scatterlist *sg, unsigned long phys_pfn,
2393 unsigned long nr_pages, int prot)
2395 struct dma_pte *first_pte = NULL, *pte = NULL;
2397 unsigned long sg_res = 0;
2398 unsigned int largepage_lvl = 0;
2399 unsigned long lvl_pages = 0;
2402 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2404 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2407 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2408 attr |= DMA_FL_PTE_PRESENT;
2409 if (domain_use_first_level(domain)) {
2410 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2412 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2413 attr |= DMA_FL_PTE_ACCESS;
2414 if (prot & DMA_PTE_WRITE)
2415 attr |= DMA_FL_PTE_DIRTY;
2421 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2424 while (nr_pages > 0) {
2428 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2430 sg_res = aligned_nrpages(sg->offset, sg->length);
2431 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2432 sg->dma_length = sg->length;
2433 pteval = (sg_phys(sg) - pgoff) | attr;
2434 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2438 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2440 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			/* It is a large page */
2444 if (largepage_lvl > 1) {
2445 unsigned long nr_superpages, end_pfn;
2447 pteval |= DMA_PTE_LARGE_PAGE;
2448 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2450 nr_superpages = sg_res / lvl_pages;
2451 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2454 * Ensure that old small page tables are
2455 * removed to make room for superpage(s).
2456 * We're adding new large pages, so make sure
2457 * we don't remove their parent tables.
2459 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2462 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
		/* We don't need a lock here; nobody else
		 * touches the IOVA range
		 */
2469 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2471 static int dumps = 5;
2472 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2473 iov_pfn, tmp, (unsigned long long)pteval);
2476 debug_dma_dump_mappings(NULL);
2481 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2483 BUG_ON(nr_pages < lvl_pages);
2484 BUG_ON(sg_res < lvl_pages);
2486 nr_pages -= lvl_pages;
2487 iov_pfn += lvl_pages;
2488 phys_pfn += lvl_pages;
2489 pteval += lvl_pages * VTD_PAGE_SIZE;
2490 sg_res -= lvl_pages;
2492 /* If the next PTE would be the first in a new page, then we
2493 need to flush the cache on the entries we've just written.
2494 And then we'll need to recalculate 'pte', so clear it and
2495 let it get set again in the if (!pte) block above.
2497 If we're done (!nr_pages) we need to flush the cache too.
2499 Also if we've been setting superpages, we may need to
2500 recalculate 'pte' and switch back to smaller pages for the
2501 end of the mapping, if the trailing size is not enough to
2502 use another superpage (i.e. sg_res < lvl_pages). */
2504 if (!nr_pages || first_pte_in_page(pte) ||
2505 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2506 domain_flush_cache(domain, first_pte,
2507 (void *)pte - (void *)first_pte);
2511 if (!sg_res && nr_pages)
2517 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2518 struct scatterlist *sg, unsigned long phys_pfn,
2519 unsigned long nr_pages, int prot)
2522 struct intel_iommu *iommu;
2524 /* Do the real mapping first */
2525 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2529 for_each_domain_iommu(iommu_id, domain) {
2530 iommu = g_iommus[iommu_id];
2531 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
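/*
 * The two wrappers above select the two flavours of domain_mapping():
 * domain_sg_mapping() passes a scatterlist and a dummy phys_pfn, while
 * domain_pfn_mapping() maps a physically contiguous pfn range and passes
 * no scatterlist.
 */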
2551 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2553 unsigned long flags;
2554 struct context_entry *context;
2560 spin_lock_irqsave(&iommu->lock, flags);
2561 context = iommu_context_addr(iommu, bus, devfn, 0);
2563 spin_unlock_irqrestore(&iommu->lock, flags);
2566 did_old = context_domain_id(context);
2567 context_clear_entry(context);
2568 __iommu_flush_cache(iommu, context, sizeof(*context));
2569 spin_unlock_irqrestore(&iommu->lock, flags);
2570 iommu->flush.flush_context(iommu,
2572 (((u16)bus) << 8) | devfn,
2573 DMA_CCMD_MASK_NOBIT,
2574 DMA_CCMD_DEVICE_INVL);
2576 if (sm_supported(iommu))
2577 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2579 iommu->flush.flush_iotlb(iommu,
2586 static inline void unlink_domain_info(struct device_domain_info *info)
2588 assert_spin_locked(&device_domain_lock);
2589 list_del(&info->link);
2590 list_del(&info->global);
2592 dev_iommu_priv_set(info->dev, NULL);
2595 static void domain_remove_dev_info(struct dmar_domain *domain)
2597 struct device_domain_info *info, *tmp;
2598 unsigned long flags;
2600 spin_lock_irqsave(&device_domain_lock, flags);
2601 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2602 __dmar_remove_one_dev_info(info);
2603 spin_unlock_irqrestore(&device_domain_lock, flags);
2606 struct dmar_domain *find_domain(struct device *dev)
2608 struct device_domain_info *info;
2610 if (unlikely(!dev || !dev->iommu))
2613 if (unlikely(attach_deferred(dev)))
2616 /* No lock here, assumes no domain exit in normal case */
2617 info = get_domain_info(dev);
2619 return info->domain;
2624 static void do_deferred_attach(struct device *dev)
2626 struct iommu_domain *domain;
2628 dev_iommu_priv_set(dev, NULL);
2629 domain = iommu_get_domain_for_dev(dev);
2631 intel_iommu_attach_device(domain, dev);
2634 static inline struct device_domain_info *
2635 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2637 struct device_domain_info *info;
2639 list_for_each_entry(info, &device_domain_list, global)
2640 if (info->segment == segment && info->bus == bus &&
2641 info->devfn == devfn)
2647 static int domain_setup_first_level(struct intel_iommu *iommu,
2648 struct dmar_domain *domain,
2652 struct dma_pte *pgd = domain->pgd;
	/*
	 * Skip top levels of page tables for IOMMUs which have less agaw
	 * than the default. Unnecessary for PT mode.
	 */
2660 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2661 pgd = phys_to_virt(dma_pte_addr(pgd));
2662 if (!dma_pte_present(pgd))
2666 level = agaw_to_level(agaw);
2667 if (level != 4 && level != 5)
2670 if (pasid != PASID_RID2PASID)
2671 flags |= PASID_FLAG_SUPERVISOR_MODE;
2673 flags |= PASID_FLAG_FL5LP;
2675 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2676 flags |= PASID_FLAG_PAGE_SNOOP;
2678 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2679 domain->iommu_did[iommu->seq_id],
static bool dev_is_real_dma_subdevice(struct device *dev)
{
	return dev && dev_is_pci(dev) &&
	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
}
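/*
 * A "real DMA subdevice" is a PCI device whose DMA is actually issued by
 * another device (pci_real_dma_dev() returns a different pci_dev), so the
 * IOMMU observes the real device's requester ID rather than this one's.
 */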
2689 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2692 struct dmar_domain *domain)
2694 struct dmar_domain *found = NULL;
2695 struct device_domain_info *info;
2696 unsigned long flags;
2699 info = alloc_devinfo_mem();
2703 if (!dev_is_real_dma_subdevice(dev)) {
2705 info->devfn = devfn;
2706 info->segment = iommu->segment;
2708 struct pci_dev *pdev = to_pci_dev(dev);
2710 info->bus = pdev->bus->number;
2711 info->devfn = pdev->devfn;
2712 info->segment = pci_domain_nr(pdev->bus);
2715 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2716 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2719 info->domain = domain;
2720 info->iommu = iommu;
2721 info->pasid_table = NULL;
2722 info->auxd_enabled = 0;
2723 INIT_LIST_HEAD(&info->auxiliary_domains);
2725 if (dev && dev_is_pci(dev)) {
2726 struct pci_dev *pdev = to_pci_dev(info->dev);
2728 if (ecap_dev_iotlb_support(iommu->ecap) &&
2729 pci_ats_supported(pdev) &&
2730 dmar_find_matched_atsr_unit(pdev))
2731 info->ats_supported = 1;
2733 if (sm_supported(iommu)) {
2734 if (pasid_supported(iommu)) {
2735 int features = pci_pasid_features(pdev);
2737 info->pasid_supported = features | 1;
2740 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2741 pci_pri_supported(pdev))
2742 info->pri_supported = 1;
2746 spin_lock_irqsave(&device_domain_lock, flags);
2748 found = find_domain(dev);
2751 struct device_domain_info *info2;
2752 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2755 found = info2->domain;
2761 spin_unlock_irqrestore(&device_domain_lock, flags);
2762 free_devinfo_mem(info);
2763 /* Caller must free the original domain */
2767 spin_lock(&iommu->lock);
2768 ret = domain_attach_iommu(domain, iommu);
2769 spin_unlock(&iommu->lock);
2772 spin_unlock_irqrestore(&device_domain_lock, flags);
2773 free_devinfo_mem(info);
2777 list_add(&info->link, &domain->devices);
2778 list_add(&info->global, &device_domain_list);
2780 dev_iommu_priv_set(dev, info);
2781 spin_unlock_irqrestore(&device_domain_lock, flags);
2783 /* PASID table is mandatory for a PCI device in scalable mode. */
2784 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2785 ret = intel_pasid_alloc_table(dev);
2787 dev_err(dev, "PASID table allocation failed\n");
2788 dmar_remove_one_dev_info(dev);
2792 /* Setup the PASID entry for requests without PASID: */
2793 spin_lock_irqsave(&iommu->lock, flags);
2794 if (hw_pass_through && domain_type_is_si(domain))
2795 ret = intel_pasid_setup_pass_through(iommu, domain,
2796 dev, PASID_RID2PASID);
2797 else if (domain_use_first_level(domain))
2798 ret = domain_setup_first_level(iommu, domain, dev,
2801 ret = intel_pasid_setup_second_level(iommu, domain,
2802 dev, PASID_RID2PASID);
2803 spin_unlock_irqrestore(&iommu->lock, flags);
2805 dev_err(dev, "Setup RID2PASID failed\n");
2806 dmar_remove_one_dev_info(dev);
2811 if (dev && domain_context_mapping(domain, dev)) {
2812 dev_err(dev, "Domain context map failed\n");
2813 dmar_remove_one_dev_info(dev);
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long first_vpfn,
				     unsigned long last_vpfn)
{
	/*
	 * The RMRR range might overlap with the physical memory range;
	 * clear it first.
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return __domain_mapping(domain, first_vpfn, NULL,
				first_vpfn, last_vpfn - first_vpfn + 1,
				DMA_PTE_READ|DMA_PTE_WRITE);
}
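/*
 * An identity mapping simply maps each DMA pfn to the equal physical pfn,
 * which is why first_vpfn is passed both as the IOVA start and as the
 * physical start in the __domain_mapping() call above.
 */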
2835 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2837 static int __init si_domain_init(int hw)
2839 struct dmar_rmrr_unit *rmrr;
2843 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2847 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2848 domain_exit(si_domain);
2856 for_each_online_node(nid) {
2857 unsigned long start_pfn, end_pfn;
2860 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2861 ret = iommu_domain_identity_map(si_domain,
2862 mm_to_dma_pfn(start_pfn),
2863 mm_to_dma_pfn(end_pfn));
	/*
	 * Identity map the RMRRs so that devices with RMRRs could also use
	 * the si_domain.
	 */
2873 for_each_rmrr_units(rmrr) {
2874 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2876 unsigned long long start = rmrr->base_address;
2877 unsigned long long end = rmrr->end_address;
2879 if (WARN_ON(end < start ||
2880 end >> agaw_to_width(si_domain->agaw)))
2883 ret = iommu_domain_identity_map(si_domain,
2884 mm_to_dma_pfn(start >> PAGE_SHIFT),
2885 mm_to_dma_pfn(end >> PAGE_SHIFT));
2894 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2896 struct dmar_domain *ndomain;
2897 struct intel_iommu *iommu;
2900 iommu = device_to_iommu(dev, &bus, &devfn);
2904 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2905 if (ndomain != domain)
2911 static bool device_has_rmrr(struct device *dev)
2913 struct dmar_rmrr_unit *rmrr;
2918 for_each_rmrr_units(rmrr) {
		/*
		 * Return TRUE if this RMRR contains the device that
		 * we are looking for.
		 */
2923 for_each_active_dev_scope(rmrr->devices,
2924 rmrr->devices_cnt, i, tmp)
2926 is_downstream_to_pci_bridge(dev, tmp)) {
2936 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2937 * is relaxable (ie. is allowed to be not enforced under some conditions)
2938 * @dev: device handle
2940 * We assume that PCI USB devices with RMRRs have them largely
2941 * for historical reasons and that the RMRR space is not actively used post
2942 * boot. This exclusion may change if vendors begin to abuse it.
 * The same exception is made for graphics devices, with the requirement that
 * any use of the RMRR regions will be torn down before assigning the device
 * to a guest.
 *
2948 * Return: true if the RMRR is relaxable, false otherwise
2950 static bool device_rmrr_is_relaxable(struct device *dev)
2952 struct pci_dev *pdev;
2954 if (!dev_is_pci(dev))
2957 pdev = to_pci_dev(dev);
2958 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2965 * There are a couple cases where we need to restrict the functionality of
2966 * devices associated with RMRRs. The first is when evaluating a device for
2967 * identity mapping because problems exist when devices are moved in and out
2968 * of domains and their respective RMRR information is lost. This means that
2969 * a device with associated RMRRs will never be in a "passthrough" domain.
2970 * The second is use of the device through the IOMMU API. This interface
2971 * expects to have full control of the IOVA space for the device. We cannot
2972 * satisfy both the requirement that RMRR access is maintained and have an
2973 * unencumbered IOVA space. We also have no ability to quiesce the device's
2974 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2975 * We therefore prevent devices associated with an RMRR from participating in
2976 * the IOMMU API, which eliminates them from device assignment.
2978 * In both cases, devices which have relaxable RMRRs are not concerned by this
2979 * restriction. See device_rmrr_is_relaxable comment.
2981 static bool device_is_rmrr_locked(struct device *dev)
2983 if (!device_has_rmrr(dev))
2986 if (device_rmrr_is_relaxable(dev))
2993 * Return the required default domain type for a specific device.
2995 * @dev: the device in query
2996 * @startup: true if this is during early boot
2999 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3001 * - 0: both identity and dynamic domains work for this device
3003 static int device_def_domain_type(struct device *dev)
3005 if (dev_is_pci(dev)) {
3006 struct pci_dev *pdev = to_pci_dev(dev);
3009 * Prevent any device marked as untrusted from getting
3010 * placed into the statically identity mapping domain.
3012 if (pdev->untrusted)
3013 return IOMMU_DOMAIN_DMA;
3015 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3016 return IOMMU_DOMAIN_IDENTITY;
3018 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3019 return IOMMU_DOMAIN_IDENTITY;
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
	/*
	 * Start from a sane IOMMU hardware state.
	 * If queued invalidation was already initialized by us
	 * (for example, while enabling interrupt remapping), then
	 * things are already rolling from a sane state.
	 */
3035 * Clear any previous faults.
3037 dmar_fault(-1, iommu);
3039 * Disable queued invalidation if supported and already enabled
3040 * before OS handover.
3042 dmar_disable_qi(iommu);
3045 if (dmar_enable_qi(iommu)) {
3047 * Queued Invalidate not enabled, use Register Based Invalidate
3049 iommu->flush.flush_context = __iommu_flush_context;
3050 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051 pr_info("%s: Using Register based invalidation\n",
3054 iommu->flush.flush_context = qi_flush_context;
3055 iommu->flush.flush_iotlb = qi_flush_iotlb;
3056 pr_info("%s: Using Queued invalidation\n", iommu->name);
3060 static int copy_context_table(struct intel_iommu *iommu,
3061 struct root_entry *old_re,
3062 struct context_entry **tbl,
3065 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066 struct context_entry *new_ce = NULL, ce;
3067 struct context_entry *old_ce = NULL;
3068 struct root_entry re;
3069 phys_addr_t old_ce_phys;
3071 tbl_idx = ext ? bus * 2 : bus;
3072 memcpy(&re, old_re, sizeof(re));
3074 for (devfn = 0; devfn < 256; devfn++) {
3075 /* First calculate the correct index */
3076 idx = (ext ? devfn * 2 : devfn) % 256;
3079 /* First save what we may have and clean up */
3081 tbl[tbl_idx] = new_ce;
3082 __iommu_flush_cache(iommu, new_ce,
3092 old_ce_phys = root_entry_lctp(&re);
3094 old_ce_phys = root_entry_uctp(&re);
3097 if (ext && devfn == 0) {
3098 /* No LCTP, try UCTP */
3107 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3112 new_ce = alloc_pgtable_page(iommu->node);
3119 /* Now copy the context entry */
3120 memcpy(&ce, old_ce + idx, sizeof(ce));
3122 if (!__context_present(&ce))
3125 did = context_domain_id(&ce);
3126 if (did >= 0 && did < cap_ndoms(iommu->cap))
3127 set_bit(did, iommu->domain_ids);
3130 * We need a marker for copied context entries. This
3131 * marker needs to work for the old format as well as
3132 * for extended context entries.
3134 * Bit 67 of the context entry is used. In the old
3135 * format this bit is available to software, in the
3136 * extended format it is the PGE bit, but PGE is ignored
		 * by HW if PASIDs are disabled (and thus still
		 * available).
		 *
3140 * So disable PASIDs first and then mark the entry
3141 * copied. This means that we don't copy PASID
3142 * translations from the old kernel, but this is fine as
3143 * faults there are not fatal.
3145 context_clear_pasid_enable(&ce);
3146 context_set_copied(&ce);
3151 tbl[tbl_idx + pos] = new_ce;
3153 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3164 struct context_entry **ctxt_tbls;
3165 struct root_entry *old_rt;
3166 phys_addr_t old_rt_phys;
3167 int ctxt_table_entries;
3168 unsigned long flags;
3173 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175 new_ext = !!ecap_ecs(iommu->ecap);
3178 * The RTT bit can only be changed when translation is disabled,
3179 * but disabling translation means to open a window for data
3180 * corruption. So bail out and don't copy anything if we would
3181 * have to change the bit.
3186 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3190 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3194 /* This is too big for the stack - allocate it from slab */
3195 ctxt_table_entries = ext ? 512 : 256;
3197 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3201 for (bus = 0; bus < 256; bus++) {
3202 ret = copy_context_table(iommu, &old_rt[bus],
3203 ctxt_tbls, bus, ext);
3205 pr_err("%s: Failed to copy context table for bus %d\n",
3211 spin_lock_irqsave(&iommu->lock, flags);
3213 /* Context tables are copied, now write them to the root_entry table */
3214 for (bus = 0; bus < 256; bus++) {
3215 int idx = ext ? bus * 2 : bus;
3218 if (ctxt_tbls[idx]) {
3219 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220 iommu->root_entry[bus].lo = val;
3223 if (!ext || !ctxt_tbls[idx + 1])
3226 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227 iommu->root_entry[bus].hi = val;
3230 spin_unlock_irqrestore(&iommu->lock, flags);
3234 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3244 #ifdef CONFIG_INTEL_IOMMU_SVM
3245 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3247 struct intel_iommu *iommu = data;
3251 return INVALID_IOASID;
	/*
	 * The VT-d virtual command interface always uses the full 20-bit
	 * PASID range. The host can partition the guest PASID range based
	 * on policies, but this is out of the guest's control.
	 */
3257 if (min < PASID_MIN || max > intel_pasid_max_id)
3258 return INVALID_IOASID;
3260 if (vcmd_alloc_pasid(iommu, &ioasid))
3261 return INVALID_IOASID;
3266 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3268 struct intel_iommu *iommu = data;
3273 * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
3274 * We can only free the PASID when all the devices are unbound.
3276 if (ioasid_find(NULL, ioasid, NULL)) {
3277 pr_alert("Cannot free active IOASID %d\n", ioasid);
3280 vcmd_free_pasid(iommu, ioasid);
3283 static void register_pasid_allocator(struct intel_iommu *iommu)
	/*
	 * If we are running in the host, there is no need for a custom
	 * allocator since PASIDs are allocated host system-wide.
	 */
3289 if (!cap_caching_mode(iommu->cap))
3292 if (!sm_supported(iommu)) {
3293 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
	/*
	 * Register a custom PASID allocator if we are running in a guest;
	 * guest PASIDs must be obtained via the virtual command interface.
	 * There can be multiple vIOMMUs in each guest but only one allocator
	 * is active. All vIOMMU allocators will eventually call the same
	 * host allocator.
	 */
3304 if (!vccap_pasid(iommu->vccap))
3307 pr_info("Register custom PASID allocator\n");
3308 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3309 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3310 iommu->pasid_allocator.pdata = (void *)iommu;
3311 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3312 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
		/*
		 * Disable scalable mode on this IOMMU if there
		 * is no custom allocator. Mixing SM-capable and
		 * non-SM vIOMMUs is not supported.
		 */
3323 static int __init init_dmars(void)
3325 struct dmar_drhd_unit *drhd;
3326 struct intel_iommu *iommu;
3332 * initialize and program root entry to not present
3335 for_each_drhd_unit(drhd) {
		/*
		 * No lock is needed as this is only incremented in the
		 * single-threaded kernel __init code path; all other
		 * accesses are read-only.
		 */
3341 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3345 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3348 /* Preallocate enough resources for IOMMU hot-addition */
3349 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3350 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3352 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3355 pr_err("Allocating global iommu array failed\n");
3360 for_each_iommu(iommu, drhd) {
3361 if (drhd->ignored) {
3362 iommu_disable_translation(iommu);
3367 * Find the max pasid size of all IOMMU's in the system.
3368 * We need to ensure the system pasid table is no bigger
3369 * than the smallest supported.
3371 if (pasid_supported(iommu)) {
3372 u32 temp = 2 << ecap_pss(iommu->ecap);
3374 intel_pasid_max_id = min_t(u32, temp,
3375 intel_pasid_max_id);
3378 g_iommus[iommu->seq_id] = iommu;
3380 intel_iommu_init_qi(iommu);
3382 ret = iommu_init_domains(iommu);
3386 init_translation_status(iommu);
3388 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3389 iommu_disable_translation(iommu);
3390 clear_translation_pre_enabled(iommu);
3391 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
		/*
		 * We could share the same root & context tables
		 * among all IOMMUs. Need to split this later.
		 */
3400 ret = iommu_alloc_root_entry(iommu);
3404 if (translation_pre_enabled(iommu)) {
3405 pr_info("Translation already enabled - trying to copy translation structures\n");
3407 ret = copy_translation_tables(iommu);
3410 * We found the IOMMU with translation
3411 * enabled - but failed to copy over the
3412 * old root-entry table. Try to proceed
3413 * by disabling translation now and
3414 * allocating a clean root-entry table.
3415 * This might cause DMAR faults, but
3416 * probably the dump will still succeed.
3418 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3420 iommu_disable_translation(iommu);
3421 clear_translation_pre_enabled(iommu);
3423 pr_info("Copied translation tables from previous kernel for %s\n",
3428 if (!ecap_pass_through(iommu->ecap))
3429 hw_pass_through = 0;
3431 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3432 pr_warn("Disable batched IOTLB flush due to virtualization");
3433 intel_iommu_strict = 1;
3435 intel_svm_check(iommu);
3439 * Now that qi is enabled on all iommus, set the root entry and flush
3440 * caches. This is required on some Intel X58 chipsets, otherwise the
3441 * flush_context function will loop forever and the boot hangs.
3443 for_each_active_iommu(iommu, drhd) {
3444 iommu_flush_write_buffer(iommu);
3445 #ifdef CONFIG_INTEL_IOMMU_SVM
3446 register_pasid_allocator(iommu);
3448 iommu_set_root_entry(iommu);
3451 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3456 iommu_identity_mapping |= IDENTMAP_GFX;
3458 check_tylersburg_isoch();
3460 ret = si_domain_init(hw_pass_through);
3467 * global invalidate context cache
3468 * global invalidate iotlb
3469 * enable translation
3471 for_each_iommu(iommu, drhd) {
3472 if (drhd->ignored) {
			/*
			 * We always have to disable PMRs or DMA may fail on
			 * this device.
			 */
3478 iommu_disable_protect_mem_regions(iommu);
3482 iommu_flush_write_buffer(iommu);
3484 #ifdef CONFIG_INTEL_IOMMU_SVM
3485 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
			/*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race condition.
			 */
3490 up_write(&dmar_global_lock);
3491 ret = intel_svm_enable_prq(iommu);
3492 down_write(&dmar_global_lock);
3497 ret = dmar_set_interrupt(iommu);
3505 for_each_active_iommu(iommu, drhd) {
3506 disable_dmar_iommu(iommu);
3507 free_dmar_iommu(iommu);
3510 domain_exit(si_domain);
3520 /* This takes a number of _MM_ pages, not VTD pages */
3521 static unsigned long intel_alloc_iova(struct device *dev,
3522 struct dmar_domain *domain,
3523 unsigned long nrpages, uint64_t dma_mask)
3525 unsigned long iova_pfn;
3528 * Restrict dma_mask to the width that the iommu can handle.
3529 * First-level translation restricts the input-address to a
3530 * canonical address (i.e., address bits 63:N have the same
3531 * value as address bit [N-1], where N is 48-bits with 4-level
3532 * paging and 57-bits with 5-level paging). Hence, skip bit
3535 if (domain_use_first_level(domain))
3536 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3539 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3542 /* Ensure we reserve the whole size-aligned region */
3543 nrpages = __roundup_pow_of_two(nrpages);
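	/*
	 * Example: a request for 5 pages reserves 8, so that the resulting
	 * IOVA stays aligned to the (power-of-two rounded) allocation size.
	 */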
3545 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
		/*
		 * First try to allocate an io virtual address in
		 * DMA_BIT_MASK(32) and if that fails then try allocating
		 * from a higher range.
		 */
3551 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552 IOVA_PFN(DMA_BIT_MASK(32)), false);
3556 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3557 IOVA_PFN(dma_mask), true);
3558 if (unlikely(!iova_pfn)) {
3559 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3567 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3568 size_t size, int dir, u64 dma_mask)
3570 struct dmar_domain *domain;
3571 phys_addr_t start_paddr;
3572 unsigned long iova_pfn;
3575 struct intel_iommu *iommu;
3576 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3578 BUG_ON(dir == DMA_NONE);
3580 if (unlikely(attach_deferred(dev)))
3581 do_deferred_attach(dev);
3583 domain = find_domain(dev);
3585 return DMA_MAPPING_ERROR;
3587 iommu = domain_get_iommu(domain);
3588 size = aligned_nrpages(paddr, size);
3590 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3595 * Check if DMAR supports zero-length reads on write only
3598 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3599 !cap_zlr(iommu->cap))
3600 prot |= DMA_PTE_READ;
3601 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3602 prot |= DMA_PTE_WRITE;
	/*
	 * paddr ~ paddr + size might span a partial page; we should map the
	 * whole page. Note: if two parts of one page are mapped separately,
	 * we might have two guest_addr mappings to the same host paddr, but
	 * this is not a big problem.
	 */
3609 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3610 mm_to_dma_pfn(paddr_pfn), size, prot);
3614 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3615 start_paddr += paddr & ~PAGE_MASK;
3617 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3623 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3624 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3625 size, (unsigned long long)paddr, dir);
3626 return DMA_MAPPING_ERROR;
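/*
 * intel_map_page() and intel_map_resource() below are thin wrappers around
 * __intel_map_single(); they differ only in whether the source is a struct
 * page plus offset or a raw physical address.
 */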
3629 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3630 unsigned long offset, size_t size,
3631 enum dma_data_direction dir,
3632 unsigned long attrs)
3634 return __intel_map_single(dev, page_to_phys(page) + offset,
3635 size, dir, *dev->dma_mask);
3638 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3639 size_t size, enum dma_data_direction dir,
3640 unsigned long attrs)
3642 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3645 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3647 struct dmar_domain *domain;
3648 unsigned long start_pfn, last_pfn;
3649 unsigned long nrpages;
3650 unsigned long iova_pfn;
3651 struct intel_iommu *iommu;
3652 struct page *freelist;
3653 struct pci_dev *pdev = NULL;
3655 domain = find_domain(dev);
3658 iommu = domain_get_iommu(domain);
3660 iova_pfn = IOVA_PFN(dev_addr);
3662 nrpages = aligned_nrpages(dev_addr, size);
3663 start_pfn = mm_to_dma_pfn(iova_pfn);
3664 last_pfn = start_pfn + nrpages - 1;
3666 if (dev_is_pci(dev))
3667 pdev = to_pci_dev(dev);
3669 freelist = domain_unmap(domain, start_pfn, last_pfn);
3670 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3671 !has_iova_flush_queue(&domain->iovad)) {
3672 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3673 nrpages, !freelist, 0);
3675 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3676 dma_free_pagelist(freelist);
3678 queue_iova(&domain->iovad, iova_pfn, nrpages,
3679 (unsigned long)freelist);
3681 * queue up the release of the unmap to save the 1/6th of the
3682 * cpu used up by the iotlb flush operation...
3686 trace_unmap_single(dev, dev_addr, size);
3689 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3690 size_t size, enum dma_data_direction dir,
3691 unsigned long attrs)
3693 intel_unmap(dev, dev_addr, size);
3696 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3697 size_t size, enum dma_data_direction dir, unsigned long attrs)
3699 intel_unmap(dev, dev_addr, size);
3702 static void *intel_alloc_coherent(struct device *dev, size_t size,
3703 dma_addr_t *dma_handle, gfp_t flags,
3704 unsigned long attrs)
3706 struct page *page = NULL;
3709 if (unlikely(attach_deferred(dev)))
3710 do_deferred_attach(dev);
3712 size = PAGE_ALIGN(size);
3713 order = get_order(size);
3715 if (gfpflags_allow_blocking(flags)) {
3716 unsigned int count = size >> PAGE_SHIFT;
3718 page = dma_alloc_from_contiguous(dev, count, order,
3719 flags & __GFP_NOWARN);
3723 page = alloc_pages(flags, order);
3726 memset(page_address(page), 0, size);
3728 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3730 dev->coherent_dma_mask);
3731 if (*dma_handle != DMA_MAPPING_ERROR)
3732 return page_address(page);
3733 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3734 __free_pages(page, order);
3739 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3740 dma_addr_t dma_handle, unsigned long attrs)
3743 struct page *page = virt_to_page(vaddr);
3745 size = PAGE_ALIGN(size);
3746 order = get_order(size);
3748 intel_unmap(dev, dma_handle, size);
3749 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750 __free_pages(page, order);
3753 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3754 int nelems, enum dma_data_direction dir,
3755 unsigned long attrs)
3757 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3758 unsigned long nrpages = 0;
3759 struct scatterlist *sg;
3762 for_each_sg(sglist, sg, nelems, i) {
3763 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3766 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3768 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3771 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3772 enum dma_data_direction dir, unsigned long attrs)
3775 struct dmar_domain *domain;
3778 unsigned long iova_pfn;
3780 struct scatterlist *sg;
3781 unsigned long start_vpfn;
3782 struct intel_iommu *iommu;
3784 BUG_ON(dir == DMA_NONE);
3786 if (unlikely(attach_deferred(dev)))
3787 do_deferred_attach(dev);
3789 domain = find_domain(dev);
3793 iommu = domain_get_iommu(domain);
3795 for_each_sg(sglist, sg, nelems, i)
3796 size += aligned_nrpages(sg->offset, sg->length);
3798 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3801 sglist->dma_length = 0;
3806 * Check if DMAR supports zero-length reads on write only
3809 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3810 !cap_zlr(iommu->cap))
3811 prot |= DMA_PTE_READ;
3812 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813 prot |= DMA_PTE_WRITE;
3815 start_vpfn = mm_to_dma_pfn(iova_pfn);
3817 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818 if (unlikely(ret)) {
3819 dma_pte_free_pagetable(domain, start_vpfn,
3820 start_vpfn + size - 1,
3821 agaw_to_level(domain->agaw) + 1);
3822 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3826 for_each_sg(sglist, sg, nelems, i)
3827 trace_map_sg(dev, i + 1, nelems, sg);
static u64 intel_get_required_mask(struct device *dev)
{
	return DMA_BIT_MASK(32);
}
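/*
 * With the IOMMU remapping DMA addresses, a 32-bit mask is always
 * sufficient: intel_alloc_iova() tries to allocate IOVAs below 4GiB first,
 * so DMA_BIT_MASK(32) is advertised as the required mask.
 */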
3837 static const struct dma_map_ops intel_dma_ops = {
3838 .alloc = intel_alloc_coherent,
3839 .free = intel_free_coherent,
3840 .map_sg = intel_map_sg,
3841 .unmap_sg = intel_unmap_sg,
3842 .map_page = intel_map_page,
3843 .unmap_page = intel_unmap_page,
3844 .map_resource = intel_map_resource,
3845 .unmap_resource = intel_unmap_resource,
3846 .dma_supported = dma_direct_supported,
3847 .mmap = dma_common_mmap,
3848 .get_sgtable = dma_common_get_sgtable,
3849 .alloc_pages = dma_common_alloc_pages,
3850 .free_pages = dma_common_free_pages,
3851 .get_required_mask = intel_get_required_mask,
3855 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3856 enum dma_data_direction dir, enum dma_sync_target target)
3858 struct dmar_domain *domain;
3859 phys_addr_t tlb_addr;
3861 domain = find_domain(dev);
3862 if (WARN_ON(!domain))
3865 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3866 if (is_swiotlb_buffer(tlb_addr))
3867 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3871 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3872 enum dma_data_direction dir, unsigned long attrs,
3875 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3876 struct dmar_domain *domain;
3877 struct intel_iommu *iommu;
3878 unsigned long iova_pfn;
3879 unsigned long nrpages;
3880 phys_addr_t tlb_addr;
3884 if (unlikely(attach_deferred(dev)))
3885 do_deferred_attach(dev);
3887 domain = find_domain(dev);
3889 if (WARN_ON(dir == DMA_NONE || !domain))
3890 return DMA_MAPPING_ERROR;
3892 iommu = domain_get_iommu(domain);
3893 if (WARN_ON(!iommu))
3894 return DMA_MAPPING_ERROR;
3896 nrpages = aligned_nrpages(0, size);
3897 iova_pfn = intel_alloc_iova(dev, domain,
3898 dma_to_mm_pfn(nrpages), dma_mask);
3900 return DMA_MAPPING_ERROR;
3903 * Check if DMAR supports zero-length reads on write only
3906 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3907 !cap_zlr(iommu->cap))
3908 prot |= DMA_PTE_READ;
3909 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3910 prot |= DMA_PTE_WRITE;
3913 * If both the physical buffer start address and size are
3914 * page aligned, we don't need to use a bounce page.
3916 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3917 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3918 aligned_size, dir, attrs);
3919 if (tlb_addr == DMA_MAPPING_ERROR) {
3922 /* Cleanup the padding area. */
3923 void *padding_start = phys_to_virt(tlb_addr);
3924 size_t padding_size = aligned_size;
3926 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3927 (dir == DMA_TO_DEVICE ||
3928 dir == DMA_BIDIRECTIONAL)) {
3929 padding_start += size;
3930 padding_size -= size;
3933 memset(padding_start, 0, padding_size);
3939 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3940 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3944 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3946 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3949 if (is_swiotlb_buffer(tlb_addr))
3950 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3951 aligned_size, dir, attrs);
3953 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3954 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3955 size, (unsigned long long)paddr, dir);
3957 return DMA_MAPPING_ERROR;
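/*
 * Summary of the bounce path above: buffers whose start or size is not
 * VT-d page aligned are copied through a page-aligned swiotlb slot before
 * being mapped, so a device using these ops (typically installed for
 * untrusted devices) never gets access to unrelated data that happens to
 * share a page with the buffer.
 */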
3961 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3962 enum dma_data_direction dir, unsigned long attrs)
3964 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3965 struct dmar_domain *domain;
3966 phys_addr_t tlb_addr;
3968 domain = find_domain(dev);
3969 if (WARN_ON(!domain))
3972 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3973 if (WARN_ON(!tlb_addr))
3976 intel_unmap(dev, dev_addr, size);
3977 if (is_swiotlb_buffer(tlb_addr))
3978 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3979 aligned_size, dir, attrs);
3981 trace_bounce_unmap_single(dev, dev_addr, size);
3985 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3986 size_t size, enum dma_data_direction dir, unsigned long attrs)
3988 return bounce_map_single(dev, page_to_phys(page) + offset,
3989 size, dir, attrs, *dev->dma_mask);
3993 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3994 enum dma_data_direction dir, unsigned long attrs)
3996 return bounce_map_single(dev, phys_addr, size,
3997 dir, attrs, *dev->dma_mask);
4001 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4002 enum dma_data_direction dir, unsigned long attrs)
4004 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4008 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4009 enum dma_data_direction dir, unsigned long attrs)
4011 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4015 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4016 enum dma_data_direction dir, unsigned long attrs)
4018 struct scatterlist *sg;
4021 for_each_sg(sglist, sg, nelems, i)
4022 bounce_unmap_page(dev, sg->dma_address,
4023 sg_dma_len(sg), dir, attrs);
4027 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4028 enum dma_data_direction dir, unsigned long attrs)
4031 struct scatterlist *sg;
4033 for_each_sg(sglist, sg, nelems, i) {
4034 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4035 sg->offset, sg->length,
4037 if (sg->dma_address == DMA_MAPPING_ERROR)
4039 sg_dma_len(sg) = sg->length;
4042 for_each_sg(sglist, sg, nelems, i)
4043 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4048 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4053 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4054 size_t size, enum dma_data_direction dir)
4056 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4060 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4061 size_t size, enum dma_data_direction dir)
4063 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4067 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4068 int nelems, enum dma_data_direction dir)
4070 struct scatterlist *sg;
4073 for_each_sg(sglist, sg, nelems, i)
4074 bounce_sync_single(dev, sg_dma_address(sg),
4075 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4079 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4080 int nelems, enum dma_data_direction dir)
4082 struct scatterlist *sg;
4085 for_each_sg(sglist, sg, nelems, i)
4086 bounce_sync_single(dev, sg_dma_address(sg),
4087 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4090 static const struct dma_map_ops bounce_dma_ops = {
4091 .alloc = intel_alloc_coherent,
4092 .free = intel_free_coherent,
4093 .map_sg = bounce_map_sg,
4094 .unmap_sg = bounce_unmap_sg,
4095 .map_page = bounce_map_page,
4096 .unmap_page = bounce_unmap_page,
4097 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4098 .sync_single_for_device = bounce_sync_single_for_device,
4099 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4100 .sync_sg_for_device = bounce_sync_sg_for_device,
4101 .map_resource = bounce_map_resource,
4102 .unmap_resource = bounce_unmap_resource,
4103 .alloc_pages = dma_common_alloc_pages,
4104 .free_pages = dma_common_free_pages,
4105 .dma_supported = dma_direct_supported,
4108 static inline int iommu_domain_cache_init(void)
4112 iommu_domain_cache = kmem_cache_create("iommu_domain",
4113 sizeof(struct dmar_domain),
4118 if (!iommu_domain_cache) {
4119 pr_err("Couldn't create iommu_domain cache\n");
4126 static inline int iommu_devinfo_cache_init(void)
4130 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4131 sizeof(struct device_domain_info),
4135 if (!iommu_devinfo_cache) {
4136 pr_err("Couldn't create devinfo cache\n");
4143 static int __init iommu_init_mempool(void)
4146 ret = iova_cache_get();
4150 ret = iommu_domain_cache_init();
4154 ret = iommu_devinfo_cache_init();
4158 kmem_cache_destroy(iommu_domain_cache);
4165 static void __init iommu_exit_mempool(void)
4167 kmem_cache_destroy(iommu_devinfo_cache);
4168 kmem_cache_destroy(iommu_domain_cache);
4172 static void __init init_no_remapping_devices(void)
4174 struct dmar_drhd_unit *drhd;
4178 for_each_drhd_unit(drhd) {
4179 if (!drhd->include_all) {
4180 for_each_active_dev_scope(drhd->devices,
4181 drhd->devices_cnt, i, dev)
4183 /* ignore DMAR unit if no devices exist */
4184 if (i == drhd->devices_cnt)
4189 for_each_active_drhd_unit(drhd) {
4190 if (drhd->include_all)
4193 for_each_active_dev_scope(drhd->devices,
4194 drhd->devices_cnt, i, dev)
4195 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4197 if (i < drhd->devices_cnt)
4200 /* This IOMMU has *only* gfx devices. Either bypass it or
4201 set the gfx_mapped flag, as appropriate */
4202 drhd->gfx_dedicated = 1;
4208 #ifdef CONFIG_SUSPEND
4209 static int init_iommu_hw(void)
4211 struct dmar_drhd_unit *drhd;
4212 struct intel_iommu *iommu = NULL;
4214 for_each_active_iommu(iommu, drhd)
4216 dmar_reenable_qi(iommu);
4218 for_each_iommu(iommu, drhd) {
4219 if (drhd->ignored) {
			/*
			 * We always have to disable PMRs or DMA may fail on
			 * this device.
			 */
4225 iommu_disable_protect_mem_regions(iommu);
4229 iommu_flush_write_buffer(iommu);
4230 iommu_set_root_entry(iommu);
4231 iommu_enable_translation(iommu);
4232 iommu_disable_protect_mem_regions(iommu);
4238 static void iommu_flush_all(void)
4240 struct dmar_drhd_unit *drhd;
4241 struct intel_iommu *iommu;
4243 for_each_active_iommu(iommu, drhd) {
4244 iommu->flush.flush_context(iommu, 0, 0, 0,
4245 DMA_CCMD_GLOBAL_INVL);
4246 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4247 DMA_TLB_GLOBAL_FLUSH);
4251 static int iommu_suspend(void)
4253 struct dmar_drhd_unit *drhd;
4254 struct intel_iommu *iommu = NULL;
4257 for_each_active_iommu(iommu, drhd) {
4258 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4260 if (!iommu->iommu_state)
4266 for_each_active_iommu(iommu, drhd) {
4267 iommu_disable_translation(iommu);
4269 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4271 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4272 readl(iommu->reg + DMAR_FECTL_REG);
4273 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4274 readl(iommu->reg + DMAR_FEDATA_REG);
4275 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4276 readl(iommu->reg + DMAR_FEADDR_REG);
4277 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4278 readl(iommu->reg + DMAR_FEUADDR_REG);
4280 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4285 for_each_active_iommu(iommu, drhd)
4286 kfree(iommu->iommu_state);
4291 static void iommu_resume(void)
4293 struct dmar_drhd_unit *drhd;
4294 struct intel_iommu *iommu = NULL;
4297 if (init_iommu_hw()) {
4299 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4301 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4305 for_each_active_iommu(iommu, drhd) {
4307 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4309 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4310 iommu->reg + DMAR_FECTL_REG);
4311 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4312 iommu->reg + DMAR_FEDATA_REG);
4313 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4314 iommu->reg + DMAR_FEADDR_REG);
4315 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4316 iommu->reg + DMAR_FEUADDR_REG);
4318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4321 for_each_active_iommu(iommu, drhd)
4322 kfree(iommu->iommu_state);
4325 static struct syscore_ops iommu_syscore_ops = {
4326 .resume = iommu_resume,
4327 .suspend = iommu_suspend,
4330 static void __init init_iommu_pm_ops(void)
4332 register_syscore_ops(&iommu_syscore_ops);
4336 static inline void init_iommu_pm_ops(void) {}
4337 #endif /* CONFIG_PM */
static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
{
	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
	    rmrr->end_address <= rmrr->base_address ||
	    arch_rmrr_sanity_check(rmrr))
		return -EINVAL;

	return 0;
}
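/*
 * Example: an RMRR with base 0x7f000000 and end 0x7f00ffff passes the
 * check above (base and end + 1 are both page aligned and end is above
 * base), provided arch_rmrr_sanity_check() is also happy with it.
 */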
4350 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4352 struct acpi_dmar_reserved_memory *rmrr;
4353 struct dmar_rmrr_unit *rmrru;
4355 rmrr = (struct acpi_dmar_reserved_memory *)header;
4356 if (rmrr_sanity_check(rmrr)) {
4358 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4359 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4360 rmrr->base_address, rmrr->end_address,
4361 dmi_get_system_info(DMI_BIOS_VENDOR),
4362 dmi_get_system_info(DMI_BIOS_VERSION),
4363 dmi_get_system_info(DMI_PRODUCT_VERSION));
4364 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4367 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4371 rmrru->hdr = header;
4373 rmrru->base_address = rmrr->base_address;
4374 rmrru->end_address = rmrr->end_address;
4376 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4377 ((void *)rmrr) + rmrr->header.length,
4378 &rmrru->devices_cnt);
4379 if (rmrru->devices_cnt && rmrru->devices == NULL)
4382 list_add(&rmrru->list, &dmar_rmrr_units);
4391 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4393 struct dmar_atsr_unit *atsru;
4394 struct acpi_dmar_atsr *tmp;
4396 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4398 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4399 if (atsr->segment != tmp->segment)
4401 if (atsr->header.length != tmp->header.length)
4403 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4410 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4412 struct acpi_dmar_atsr *atsr;
4413 struct dmar_atsr_unit *atsru;
4415 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4418 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4419 atsru = dmar_find_atsr(atsr);
4423 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	/*
	 * If memory is allocated from slab by the ACPI _DSM method, we need
	 * to copy the memory content because the memory buffer will be freed
	 * on exit.
	 */
4432 atsru->hdr = (void *)(atsru + 1);
4433 memcpy(atsru->hdr, hdr, hdr->length);
4434 atsru->include_all = atsr->flags & 0x1;
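	/*
	 * Bit 0 of the ATSR flags is ACPI_DMAR_ALL_PORTS: when set, the unit
	 * covers every PCIe root port in this segment and no device scope
	 * list is parsed below.
	 */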
4435 if (!atsru->include_all) {
4436 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4437 (void *)atsr + atsr->header.length,
4438 &atsru->devices_cnt);
4439 if (atsru->devices_cnt && atsru->devices == NULL) {
4445 list_add_rcu(&atsru->list, &dmar_atsr_units);
4450 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4452 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4456 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4458 struct acpi_dmar_atsr *atsr;
4459 struct dmar_atsr_unit *atsru;
4461 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4462 atsru = dmar_find_atsr(atsr);
4464 list_del_rcu(&atsru->list);
4466 intel_iommu_free_atsr(atsru);
4472 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4476 struct acpi_dmar_atsr *atsr;
4477 struct dmar_atsr_unit *atsru;
4479 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4480 atsru = dmar_find_atsr(atsr);
4484 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4485 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4493 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4496 struct intel_iommu *iommu = dmaru->iommu;
4498 if (g_iommus[iommu->seq_id])
4501 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4502 pr_warn("%s: Doesn't support hardware pass through.\n",
4506 if (!ecap_sc_support(iommu->ecap) &&
4507 domain_update_iommu_snooping(iommu)) {
4508 pr_warn("%s: Doesn't support snooping.\n",
4512 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4513 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4514 pr_warn("%s: Doesn't support large page.\n",
4520 * Disable translation if already enabled prior to OS handover.
4522 if (iommu->gcmd & DMA_GCMD_TE)
4523 iommu_disable_translation(iommu);
4525 g_iommus[iommu->seq_id] = iommu;
4526 ret = iommu_init_domains(iommu);
4528 ret = iommu_alloc_root_entry(iommu);
4532 intel_svm_check(iommu);
4534 if (dmaru->ignored) {
4536 * we always have to disable PMRs or DMA may fail on this device
4539 iommu_disable_protect_mem_regions(iommu);
4543 intel_iommu_init_qi(iommu);
4544 iommu_flush_write_buffer(iommu);
4546 #ifdef CONFIG_INTEL_IOMMU_SVM
4547 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4548 ret = intel_svm_enable_prq(iommu);
4553 ret = dmar_set_interrupt(iommu);
4557 iommu_set_root_entry(iommu);
4558 iommu_enable_translation(iommu);
4560 iommu_disable_protect_mem_regions(iommu);
4564 disable_dmar_iommu(iommu);
4566 free_dmar_iommu(iommu);
4570 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4573 struct intel_iommu *iommu = dmaru->iommu;
4575 if (!intel_iommu_enabled)
4581 ret = intel_iommu_add(dmaru);
4583 disable_dmar_iommu(iommu);
4584 free_dmar_iommu(iommu);
4590 static void intel_iommu_free_dmars(void)
4592 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4593 struct dmar_atsr_unit *atsru, *atsr_n;
4595 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4596 list_del(&rmrru->list);
4597 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4601 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4602 list_del(&atsru->list);
4603 intel_iommu_free_atsr(atsru);
4607 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4610 struct pci_bus *bus;
4611 struct pci_dev *bridge = NULL;
4613 struct acpi_dmar_atsr *atsr;
4614 struct dmar_atsr_unit *atsru;
4616 dev = pci_physfn(dev);
4617 for (bus = dev->bus; bus; bus = bus->parent) {
4619 /* If it's an integrated device, allow ATS */
4622 /* Connected via non-PCIe: no ATS */
4623 if (!pci_is_pcie(bridge) ||
4624 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4626 /* If we found the root port, look it up in the ATSR */
4627 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4632 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4633 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4634 if (atsr->segment != pci_domain_nr(dev->bus))
4637 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4638 if (tmp == &bridge->dev)
4641 if (atsru->include_all)
4651 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4654 struct dmar_rmrr_unit *rmrru;
4655 struct dmar_atsr_unit *atsru;
4656 struct acpi_dmar_atsr *atsr;
4657 struct acpi_dmar_reserved_memory *rmrr;
4659 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4662 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4663 rmrr = container_of(rmrru->hdr,
4664 struct acpi_dmar_reserved_memory, header);
4665 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4666 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4667 ((void *)rmrr) + rmrr->header.length,
4668 rmrr->segment, rmrru->devices,
4669 rmrru->devices_cnt);
4672 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4673 dmar_remove_dev_scope(info, rmrr->segment,
4674 rmrru->devices, rmrru->devices_cnt);
4678 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4679 if (atsru->include_all)
4682 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4683 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4684 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4685 (void *)atsr + atsr->header.length,
4686 atsr->segment, atsru->devices,
4687 atsru->devices_cnt);
4692 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4693 if (dmar_remove_dev_scope(info, atsr->segment,
4694 atsru->devices, atsru->devices_cnt))
4702 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4703 unsigned long val, void *v)
4705 struct memory_notify *mhp = v;
4706 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4707 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4711 case MEM_GOING_ONLINE:
4712 if (iommu_domain_identity_map(si_domain,
4713 start_vpfn, last_vpfn)) {
4714 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4715 start_vpfn, last_vpfn);
4721 case MEM_CANCEL_ONLINE:
4723 struct dmar_drhd_unit *drhd;
4724 struct intel_iommu *iommu;
4725 struct page *freelist;
4727 freelist = domain_unmap(si_domain,
4728 start_vpfn, last_vpfn);
4731 for_each_active_iommu(iommu, drhd)
4732 iommu_flush_iotlb_psi(iommu, si_domain,
4733 start_vpfn, mhp->nr_pages,
4736 dma_free_pagelist(freelist);
4744 static struct notifier_block intel_iommu_memory_nb = {
4745 .notifier_call = intel_iommu_memory_notifier,
4749 static void free_all_cpu_cached_iovas(unsigned int cpu)
4753 for (i = 0; i < g_num_of_iommus; i++) {
4754 struct intel_iommu *iommu = g_iommus[i];
4755 struct dmar_domain *domain;
4761 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4762 domain = get_iommu_domain(iommu, (u16)did);
4764 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4767 free_cpu_cached_iovas(cpu, &domain->iovad);
4772 static int intel_iommu_cpu_dead(unsigned int cpu)
4774 free_all_cpu_cached_iovas(cpu);
4778 static void intel_disable_iommus(void)
4780 struct intel_iommu *iommu = NULL;
4781 struct dmar_drhd_unit *drhd;
4783 for_each_iommu(iommu, drhd)
4784 iommu_disable_translation(iommu);
4787 void intel_iommu_shutdown(void)
4789 struct dmar_drhd_unit *drhd;
4790 struct intel_iommu *iommu = NULL;
4792 if (no_iommu || dmar_disabled)
4795 down_write(&dmar_global_lock);
4797 /* Disable PMRs explicitly here. */
4798 for_each_iommu(iommu, drhd)
4799 iommu_disable_protect_mem_regions(iommu);
4801 /* Make sure the IOMMUs are switched off */
4802 intel_disable_iommus();
4804 up_write(&dmar_global_lock);
static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
{
	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);

	return container_of(iommu_dev, struct intel_iommu, iommu);
}
4814 static ssize_t intel_iommu_show_version(struct device *dev,
4815 struct device_attribute *attr,
4818 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4819 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4820 return sprintf(buf, "%d:%d\n",
4821 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4823 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4825 static ssize_t intel_iommu_show_address(struct device *dev,
4826 struct device_attribute *attr,
4829 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4830 return sprintf(buf, "%llx\n", iommu->reg_phys);
4832 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4834 static ssize_t intel_iommu_show_cap(struct device *dev,
4835 struct device_attribute *attr,
4838 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4839 return sprintf(buf, "%llx\n", iommu->cap);
4841 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4843 static ssize_t intel_iommu_show_ecap(struct device *dev,
4844 struct device_attribute *attr,
4847 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4848 return sprintf(buf, "%llx\n", iommu->ecap);
4850 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4852 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4853 struct device_attribute *attr,
4856 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4857 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4859 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4861 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4862 struct device_attribute *attr,
4865 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4866 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4867 cap_ndoms(iommu->cap)));
4869 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4871 static struct attribute *intel_iommu_attrs[] = {
4872 &dev_attr_version.attr,
4873 &dev_attr_address.attr,
4875 &dev_attr_ecap.attr,
4876 &dev_attr_domains_supported.attr,
4877 &dev_attr_domains_used.attr,
4881 static struct attribute_group intel_iommu_group = {
4882 .name = "intel-iommu",
4883 .attrs = intel_iommu_attrs,
4886 const struct attribute_group *intel_iommu_groups[] = {
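/*
 * These attributes are attached to each DMAR unit when it is registered
 * with the IOMMU core from intel_iommu_init() via iommu_device_sysfs_add().
 * As a rough illustration (the exact path depends on the IOMMU core's
 * sysfs layout and the unit name, typically "dmarN"), the capability
 * register of the first unit would be readable at something like
 * /sys/class/iommu/dmar0/intel-iommu/cap.
 */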
4891 static inline bool has_external_pci(void)
4893 struct pci_dev *pdev = NULL;
4895 for_each_pci_dev(pdev)
4896 if (pdev->external_facing)
4902 static int __init platform_optin_force_iommu(void)
4904 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4907 if (no_iommu || dmar_disabled)
4908 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4911 * If Intel-IOMMU is disabled by default, we will apply identity
4912 * map for all devices except those marked as being untrusted.
4915 iommu_set_default_passthrough(false);
4923 static int __init probe_acpi_namespace_devices(void)
4925 struct dmar_drhd_unit *drhd;
4926 /* To avoid a -Wunused-but-set-variable warning. */
4927 struct intel_iommu *iommu __maybe_unused;
4931 for_each_active_iommu(iommu, drhd) {
4932 for_each_active_dev_scope(drhd->devices,
4933 drhd->devices_cnt, i, dev) {
4934 struct acpi_device_physical_node *pn;
4935 struct iommu_group *group;
4936 struct acpi_device *adev;
4938 if (dev->bus != &acpi_bus_type)
4941 adev = to_acpi_device(dev);
4942 mutex_lock(&adev->physical_node_lock);
4943 list_for_each_entry(pn,
4944 &adev->physical_node_list, node) {
4945 group = iommu_group_get(pn->dev);
4947 iommu_group_put(group);
4951 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4952 ret = iommu_probe_device(pn->dev);
4956 mutex_unlock(&adev->physical_node_lock);
4966 int __init intel_iommu_init(void)
4969 struct dmar_drhd_unit *drhd;
4970 struct intel_iommu *iommu;
4973 * Intel IOMMU is required for a TXT/tboot launch or platform
4974 * opt in, so enforce that.
4976 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4977 platform_optin_force_iommu();
4979 if (iommu_init_mempool()) {
4981 panic("tboot: Failed to initialize iommu memory\n");
4985 down_write(&dmar_global_lock);
4986 if (dmar_table_init()) {
4988 panic("tboot: Failed to initialize DMAR table\n");
4992 if (dmar_dev_scope_init() < 0) {
4994 panic("tboot: Failed to initialize DMAR device scope\n");
4998 up_write(&dmar_global_lock);
5001 * The bus notifier takes the dmar_global_lock, so lockdep will
5002 * complain later when we register it under the lock.
5004 dmar_register_bus_notifier();
5006 down_write(&dmar_global_lock);
5009 intel_iommu_debugfs_init();
5011 if (no_iommu || dmar_disabled) {
5013 * We exit the function here to ensure IOMMU's remapping and
5014 * mempool aren't set up, which means that the IOMMU's PMRs
5015 * won't be disabled via the call to init_dmars(). So disable
5016 * them explicitly here. The PMRs were set up by tboot prior to
5017 * calling SENTER, but the kernel is expected to reset/tear them down.
5020 if (intel_iommu_tboot_noforce) {
5021 for_each_iommu(iommu, drhd)
5022 iommu_disable_protect_mem_regions(iommu);
5026 * Make sure the IOMMUs are switched off, even when we
5027 * boot into a kexec kernel and the previous kernel left
5030 intel_disable_iommus();
5034 if (list_empty(&dmar_rmrr_units))
5035 pr_info("No RMRR found\n");
5037 if (list_empty(&dmar_atsr_units))
5038 pr_info("No ATSR found\n");
5040 if (dmar_init_reserved_ranges()) {
5042 panic("tboot: Failed to reserve iommu ranges\n");
5043 goto out_free_reserved_range;
5047 intel_iommu_gfx_mapped = 1;
5049 init_no_remapping_devices();
5054 panic("tboot: Failed to initialize DMARs\n");
5055 pr_err("Initialization failed\n");
5056 goto out_free_reserved_range;
5058 up_write(&dmar_global_lock);
5060 init_iommu_pm_ops();
5062 down_read(&dmar_global_lock);
5063 for_each_active_iommu(iommu, drhd) {
5064 iommu_device_sysfs_add(&iommu->iommu, NULL,
5067 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5068 iommu_device_register(&iommu->iommu);
5070 up_read(&dmar_global_lock);
5072 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5073 if (si_domain && !hw_pass_through)
5074 register_memory_notifier(&intel_iommu_memory_nb);
5075 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5076 intel_iommu_cpu_dead);
5078 down_read(&dmar_global_lock);
5079 if (probe_acpi_namespace_devices())
5080 pr_warn("ACPI name space devices didn't probe correctly\n");
5082 /* Finally, we enable the DMA remapping hardware. */
5083 for_each_iommu(iommu, drhd) {
5084 if (!drhd->ignored && !translation_pre_enabled(iommu))
5085 iommu_enable_translation(iommu);
5087 iommu_disable_protect_mem_regions(iommu);
5089 up_read(&dmar_global_lock);
5091 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5093 intel_iommu_enabled = 1;
5097 out_free_reserved_range:
5098 put_iova_domain(&reserved_iova_list);
5100 intel_iommu_free_dmars();
5101 up_write(&dmar_global_lock);
5102 iommu_exit_mempool();
5106 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5108 struct intel_iommu *iommu = opaque;
5110 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5115 * NB - intel-iommu lacks any sort of reference counting for the users of
5116 * dependent devices. If multiple endpoints have intersecting dependent
5117 * devices, unbinding the driver from any one of them will possibly leave
5118 * the others unable to operate.
5120 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5122 if (!iommu || !dev || !dev_is_pci(dev))
5125 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5128 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5130 struct dmar_domain *domain;
5131 struct intel_iommu *iommu;
5132 unsigned long flags;
5134 assert_spin_locked(&device_domain_lock);
5139 iommu = info->iommu;
5140 domain = info->domain;
5143 if (dev_is_pci(info->dev) && sm_supported(iommu))
5144 intel_pasid_tear_down_entry(iommu, info->dev,
5145 PASID_RID2PASID, false);
5147 iommu_disable_dev_iotlb(info);
5148 if (!dev_is_real_dma_subdevice(info->dev))
5149 domain_context_clear(iommu, info->dev);
5150 intel_pasid_free_table(info->dev);
5153 unlink_domain_info(info);
5155 spin_lock_irqsave(&iommu->lock, flags);
5156 domain_detach_iommu(domain, iommu);
5157 spin_unlock_irqrestore(&iommu->lock, flags);
5159 free_devinfo_mem(info);
5162 static void dmar_remove_one_dev_info(struct device *dev)
5164 struct device_domain_info *info;
5165 unsigned long flags;
5167 spin_lock_irqsave(&device_domain_lock, flags);
5168 info = get_domain_info(dev);
5170 __dmar_remove_one_dev_info(info);
5171 spin_unlock_irqrestore(&device_domain_lock, flags);
5174 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5178 /* calculate AGAW */
5179 domain->gaw = guest_width;
5180 adjust_width = guestwidth_to_adjustwidth(guest_width);
5181 domain->agaw = width_to_agaw(adjust_width);
5183 domain->iommu_coherency = 0;
5184 domain->iommu_snooping = 0;
5185 domain->iommu_superpage = 0;
5186 domain->max_addr = 0;
5188 /* always allocate the top pgd */
5189 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5192 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5196 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5198 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5199 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5201 if (!intel_iommu_strict &&
5202 init_iova_flush_queue(&dmar_domain->iovad,
5203 iommu_flush_iova, iova_entry_free))
5204 pr_info("iova flush queue initialization failed\n");
5207 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5209 struct dmar_domain *dmar_domain;
5210 struct iommu_domain *domain;
5213 case IOMMU_DOMAIN_DMA:
5214 case IOMMU_DOMAIN_UNMANAGED:
5215 dmar_domain = alloc_domain(0);
5217 pr_err("Can't allocate dmar_domain\n");
5220 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5221 pr_err("Domain initialization failed\n");
5222 domain_exit(dmar_domain);
5226 if (type == IOMMU_DOMAIN_DMA)
5227 intel_init_iova_domain(dmar_domain);
5229 domain = &dmar_domain->domain;
5230 domain->geometry.aperture_start = 0;
5231 domain->geometry.aperture_end =
5232 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5233 domain->geometry.force_aperture = true;
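/*
 * With the default 57-bit address width passed to md_domain_init() above,
 * the advertised aperture is [0, 2^57 - 1]. The domain's gaw may later be
 * reduced in prepare_domain_attach_device() if the IOMMU it is attached to
 * supports a smaller MGAW.
 */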
5236 case IOMMU_DOMAIN_IDENTITY:
5237 return &si_domain->domain;
5245 static void intel_iommu_domain_free(struct iommu_domain *domain)
5247 if (domain != &si_domain->domain)
5248 domain_exit(to_dmar_domain(domain));
5252 * Check whether a @domain could be attached to the @dev through the
5253 * aux-domain attach/detach APIs.
5256 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5258 struct device_domain_info *info = get_domain_info(dev);
5260 return info && info->auxd_enabled &&
5261 domain->type == IOMMU_DOMAIN_UNMANAGED;
5264 static void auxiliary_link_device(struct dmar_domain *domain,
5267 struct device_domain_info *info = get_domain_info(dev);
5269 assert_spin_locked(&device_domain_lock);
5273 domain->auxd_refcnt++;
5274 list_add(&domain->auxd, &info->auxiliary_domains);
5277 static void auxiliary_unlink_device(struct dmar_domain *domain,
5280 struct device_domain_info *info = get_domain_info(dev);
5282 assert_spin_locked(&device_domain_lock);
5286 list_del(&domain->auxd);
5287 domain->auxd_refcnt--;
5289 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5290 ioasid_free(domain->default_pasid);
5293 static int aux_domain_add_dev(struct dmar_domain *domain,
5297 unsigned long flags;
5298 struct intel_iommu *iommu;
5300 iommu = device_to_iommu(dev, NULL, NULL);
5304 if (domain->default_pasid <= 0) {
5307 /* No private data needed for the default pasid */
5308 pasid = ioasid_alloc(NULL, PASID_MIN,
5309 pci_max_pasids(to_pci_dev(dev)) - 1,
5311 if (pasid == INVALID_IOASID) {
5312 pr_err("Can't allocate default pasid\n");
5315 domain->default_pasid = pasid;
5318 spin_lock_irqsave(&device_domain_lock, flags);
5320 * iommu->lock must be held to attach domain to iommu and setup the
5321 * pasid entry for second level translation.
5323 spin_lock(&iommu->lock);
5324 ret = domain_attach_iommu(domain, iommu);
5328 /* Setup the PASID entry for mediated devices: */
5329 if (domain_use_first_level(domain))
5330 ret = domain_setup_first_level(iommu, domain, dev,
5331 domain->default_pasid);
5333 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5334 domain->default_pasid);
5337 spin_unlock(&iommu->lock);
5339 auxiliary_link_device(domain, dev);
5341 spin_unlock_irqrestore(&device_domain_lock, flags);
5346 domain_detach_iommu(domain, iommu);
5348 spin_unlock(&iommu->lock);
5349 spin_unlock_irqrestore(&device_domain_lock, flags);
5350 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5351 ioasid_free(domain->default_pasid);
5356 static void aux_domain_remove_dev(struct dmar_domain *domain,
5359 struct device_domain_info *info;
5360 struct intel_iommu *iommu;
5361 unsigned long flags;
5363 if (!is_aux_domain(dev, &domain->domain))
5366 spin_lock_irqsave(&device_domain_lock, flags);
5367 info = get_domain_info(dev);
5368 iommu = info->iommu;
5370 auxiliary_unlink_device(domain, dev);
5372 spin_lock(&iommu->lock);
5373 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5374 domain_detach_iommu(domain, iommu);
5375 spin_unlock(&iommu->lock);
5377 spin_unlock_irqrestore(&device_domain_lock, flags);
5380 static int prepare_domain_attach_device(struct iommu_domain *domain,
5383 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5384 struct intel_iommu *iommu;
5387 iommu = device_to_iommu(dev, NULL, NULL);
5391 /* check if this iommu agaw is sufficient for max mapped address */
5392 addr_width = agaw_to_width(iommu->agaw);
5393 if (addr_width > cap_mgaw(iommu->cap))
5394 addr_width = cap_mgaw(iommu->cap);
5396 if (dmar_domain->max_addr > (1LL << addr_width)) {
5397 dev_err(dev, "%s: iommu width (%d) is not "
5398 "sufficient for the mapped address (%llx)\n",
5399 __func__, addr_width, dmar_domain->max_addr);
5402 dmar_domain->gaw = addr_width;
5405 * Knock out extra levels of page tables if necessary
5407 while (iommu->agaw < dmar_domain->agaw) {
5408 struct dma_pte *pte;
5410 pte = dmar_domain->pgd;
5411 if (dma_pte_present(pte)) {
5412 dmar_domain->pgd = (struct dma_pte *)
5413 phys_to_virt(dma_pte_addr(pte));
5414 free_pgtable_page(pte);
5416 dmar_domain->agaw--;
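/*
 * Each pass of the loop above peels off one page-table level: the first
 * entry of the current top-level table becomes the new pgd, the old top
 * page is freed, and agaw is decremented until it matches what this
 * IOMMU can actually walk.
 */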
5422 static int intel_iommu_attach_device(struct iommu_domain *domain,
5427 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5428 device_is_rmrr_locked(dev)) {
5429 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5433 if (is_aux_domain(dev, domain))
5436 /* normally dev is not mapped */
5437 if (unlikely(domain_context_mapped(dev))) {
5438 struct dmar_domain *old_domain;
5440 old_domain = find_domain(dev);
5442 dmar_remove_one_dev_info(dev);
5445 ret = prepare_domain_attach_device(domain, dev);
5449 return domain_add_dev_info(to_dmar_domain(domain), dev);
5452 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5457 if (!is_aux_domain(dev, domain))
5460 ret = prepare_domain_attach_device(domain, dev);
5464 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5467 static void intel_iommu_detach_device(struct iommu_domain *domain,
5470 dmar_remove_one_dev_info(dev);
5473 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5476 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5479 #ifdef CONFIG_INTEL_IOMMU_SVM
5481 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5482 * VT-d granularity. Invalidation is typically included in the unmap operation
5483 * as a result of DMA or VFIO unmap. However, for assigned devices the guest
5484 * owns the first level page tables. Invalidations of translation caches in the
5485 * guest are trapped and passed down to the host.
5487 * The vIOMMU in the guest will only expose first level page tables, therefore
5488 * we do not support IOTLB granularity for requests without PASID (second level).
5490 * For example, to find the VT-d granularity encoding for IOTLB
5491 * type and page selective granularity within PASID:
5492 * X: indexed by iommu cache type
5493 * Y: indexed by enum iommu_inv_granularity
5494 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5498 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5500 * PASID based IOTLB invalidation: PASID selective (per PASID),
5501 * page selective (address granularity)
5503 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5504 /* PASID based dev TLBs */
5505 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5507 {-EINVAL, -EINVAL, -EINVAL}
5510 static inline int to_vtd_granularity(int type, int granu)
5512 return inv_type_granu_table[type][granu];
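/*
 * Completing the example from the comment above the table (and assuming
 * the uapi enum order domain, PASID, address):
 *   to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB, IOMMU_INV_GRANU_ADDR)
 * yields QI_GRAN_PSI_PASID (page selective within a PASID), whereas a
 * domain-selective request in the same row maps to -EINVAL and is
 * rejected by the caller.
 */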
5515 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5517 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5519 /* VT-d size is encoded as 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
5520 * The IOMMU cache invalidate API passes granu_size in bytes and the number
5521 * of granules of that size that are contiguous in memory.
5523 return order_base_2(nr_pages);
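/*
 * Worked example: granu_size = 4KiB and nr_granules = 512 describe a
 * contiguous 2MiB range, so nr_pages = 512 and order_base_2(512) = 9,
 * which is exactly the "9 for 2MB" encoding mentioned above.
 */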
5527 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5528 struct iommu_cache_invalidate_info *inv_info)
5530 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5531 struct device_domain_info *info;
5532 struct intel_iommu *iommu;
5533 unsigned long flags;
5540 if (!inv_info || !dmar_domain)
5543 if (!dev || !dev_is_pci(dev))
5546 iommu = device_to_iommu(dev, &bus, &devfn);
5550 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5553 spin_lock_irqsave(&device_domain_lock, flags);
5554 spin_lock(&iommu->lock);
5555 info = get_domain_info(dev);
5560 did = dmar_domain->iommu_did[iommu->seq_id];
5561 sid = PCI_DEVID(bus, devfn);
5563 /* Size is only valid in address selective invalidation */
5564 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5565 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5566 inv_info->granu.addr_info.nb_granules);
5568 for_each_set_bit(cache_type,
5569 (unsigned long *)&inv_info->cache,
5570 IOMMU_CACHE_INV_TYPE_NR) {
5575 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5576 if (granu == -EINVAL) {
5577 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5578 cache_type, inv_info->granularity);
5583 * PASID is stored in different locations based on the granularity.
5586 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5587 (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5588 pasid = inv_info->granu.pasid_info.pasid;
5589 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5590 (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5591 pasid = inv_info->granu.addr_info.pasid;
5593 switch (BIT(cache_type)) {
5594 case IOMMU_CACHE_INV_TYPE_IOTLB:
5595 /* HW will ignore LSB bits based on address mask */
5596 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5598 (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5599 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5600 inv_info->granu.addr_info.addr, size);
5604 * If granu is PASID-selective, address is ignored.
5605 * We use npages = -1 to indicate that.
5607 qi_flush_piotlb(iommu, did, pasid,
5608 mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5609 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5610 inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5612 if (!info->ats_enabled)
5615 * Always flush device IOTLB if ATS is enabled. The vIOMMU
5616 * in the guest may assume an IOTLB flush is inclusive of the
5617 * device IOTLB, which is more efficient.
5620 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5622 * PASID based device TLB invalidation does not support
5623 * IOMMU_INV_GRANU_PASID granularity but only supports
5624 * IOMMU_INV_GRANU_ADDR.
5625 * The equivalent of that is to set the size to cover the
5626 * entire 64-bit address range. The user only provides PASID
5627 * info without address info, so we set addr to 0.
5629 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5630 size = 64 - VTD_PAGE_SHIFT;
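/*
 * With the usual VTD_PAGE_SHIFT of 12 this is an address-mask order of
 * 52, i.e. 2^52 4KiB pages, spanning the whole 64-bit address space as
 * the comment above requires.
 */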
5632 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5633 addr = inv_info->granu.addr_info.addr;
5636 if (info->ats_enabled)
5637 qi_flush_dev_iotlb_pasid(iommu, sid,
5639 info->ats_qdep, addr,
5642 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5645 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5651 spin_unlock(&iommu->lock);
5652 spin_unlock_irqrestore(&device_domain_lock, flags);
5658 static int intel_iommu_map(struct iommu_domain *domain,
5659 unsigned long iova, phys_addr_t hpa,
5660 size_t size, int iommu_prot, gfp_t gfp)
5662 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5667 if (iommu_prot & IOMMU_READ)
5668 prot |= DMA_PTE_READ;
5669 if (iommu_prot & IOMMU_WRITE)
5670 prot |= DMA_PTE_WRITE;
5671 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5672 prot |= DMA_PTE_SNP;
5674 max_addr = iova + size;
5675 if (dmar_domain->max_addr < max_addr) {
5678 /* check if minimum agaw is sufficient for mapped address */
5679 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5680 if (end < max_addr) {
5681 pr_err("%s: iommu width (%d) is not "
5682 "sufficient for the mapped address (%llx)\n",
5683 __func__, dmar_domain->gaw, max_addr);
5686 dmar_domain->max_addr = max_addr;
5688 /* Round up size to the next multiple of PAGE_SIZE, if it and
5689 the low bits of hpa would take us onto the next page. */
5690 size = aligned_nrpages(hpa, size);
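/*
 * Illustration of the rounding above (a sketch, assuming the usual
 * aligned_nrpages() semantics): hpa = 0x1ff0 with size = 0x20 straddles
 * a 4KiB boundary, so two pages are mapped rather than one. Note that
 * from this point on "size" is a page count, not a byte count.
 */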
5691 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5692 hpa >> VTD_PAGE_SHIFT, size, prot);
5696 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5697 unsigned long iova, size_t size,
5698 struct iommu_iotlb_gather *gather)
5700 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5701 struct page *freelist = NULL;
5702 unsigned long start_pfn, last_pfn;
5703 unsigned int npages;
5704 int iommu_id, level = 0;
5706 /* Cope with horrid API which requires us to unmap more than the
5707 size argument if it happens to be a large-page mapping. */
5708 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5710 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5711 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
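/*
 * Example of the "horrid API" note above: a request to unmap a single
 * 4KiB page that happens to sit inside a 2MiB superpage mapping bumps
 * size up to the full superpage here, so the whole 2MiB mapping is torn
 * down and flushed below.
 */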
5713 start_pfn = iova >> VTD_PAGE_SHIFT;
5714 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5716 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5718 npages = last_pfn - start_pfn + 1;
5720 for_each_domain_iommu(iommu_id, dmar_domain)
5721 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5722 start_pfn, npages, !freelist, 0);
5724 dma_free_pagelist(freelist);
5726 if (dmar_domain->max_addr == iova + size)
5727 dmar_domain->max_addr = iova;
5732 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5735 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5736 struct dma_pte *pte;
5740 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5741 if (pte && dma_pte_present(pte))
5742 phys = dma_pte_addr(pte) +
5743 (iova & (BIT_MASK(level_to_offset_bits(level) +
5744 VTD_PAGE_SHIFT) - 1));
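/*
 * The offset added back to dma_pte_addr() depends on the level at which
 * the leaf PTE was found: the low 12 bits of the IOVA for a 4KiB
 * mapping, 21 bits for a 2MiB superpage, 30 bits for a 1GiB superpage.
 */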
5749 static inline bool scalable_mode_support(void)
5751 struct dmar_drhd_unit *drhd;
5752 struct intel_iommu *iommu;
5756 for_each_active_iommu(iommu, drhd) {
5757 if (!sm_supported(iommu)) {
5767 static inline bool iommu_pasid_support(void)
5769 struct dmar_drhd_unit *drhd;
5770 struct intel_iommu *iommu;
5774 for_each_active_iommu(iommu, drhd) {
5775 if (!pasid_supported(iommu)) {
5785 static inline bool nested_mode_support(void)
5787 struct dmar_drhd_unit *drhd;
5788 struct intel_iommu *iommu;
5792 for_each_active_iommu(iommu, drhd) {
5793 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5803 static bool intel_iommu_capable(enum iommu_cap cap)
5805 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5806 return domain_update_iommu_snooping(NULL) == 1;
5807 if (cap == IOMMU_CAP_INTR_REMAP)
5808 return irq_remapping_enabled == 1;
5813 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5815 struct intel_iommu *iommu;
5817 iommu = device_to_iommu(dev, NULL, NULL);
5819 return ERR_PTR(-ENODEV);
5821 if (translation_pre_enabled(iommu))
5822 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5824 return &iommu->iommu;
5827 static void intel_iommu_release_device(struct device *dev)
5829 struct intel_iommu *iommu;
5831 iommu = device_to_iommu(dev, NULL, NULL);
5835 dmar_remove_one_dev_info(dev);
5837 set_dma_ops(dev, NULL);
5840 static void intel_iommu_probe_finalize(struct device *dev)
5842 struct iommu_domain *domain;
5844 domain = iommu_get_domain_for_dev(dev);
5845 if (device_needs_bounce(dev))
5846 set_dma_ops(dev, &bounce_dma_ops);
5847 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5848 set_dma_ops(dev, &intel_dma_ops);
5850 set_dma_ops(dev, NULL);
5853 static void intel_iommu_get_resv_regions(struct device *device,
5854 struct list_head *head)
5856 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5857 struct iommu_resv_region *reg;
5858 struct dmar_rmrr_unit *rmrr;
5859 struct device *i_dev;
5862 down_read(&dmar_global_lock);
5863 for_each_rmrr_units(rmrr) {
5864 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5866 struct iommu_resv_region *resv;
5867 enum iommu_resv_type type;
5870 if (i_dev != device &&
5871 !is_downstream_to_pci_bridge(device, i_dev))
5874 length = rmrr->end_address - rmrr->base_address + 1;
5876 type = device_rmrr_is_relaxable(device) ?
5877 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5879 resv = iommu_alloc_resv_region(rmrr->base_address,
5880 length, prot, type);
5884 list_add_tail(&resv->list, head);
5887 up_read(&dmar_global_lock);
5889 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5890 if (dev_is_pci(device)) {
5891 struct pci_dev *pdev = to_pci_dev(device);
5893 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5894 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5895 IOMMU_RESV_DIRECT_RELAXABLE);
5897 list_add_tail(&reg->list, head);
5900 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5902 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5903 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5907 list_add_tail(&reg->list, head);
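/*
 * Taken together, the reserved regions reported for a device are: any
 * RMRRs that target it (direct or relaxable direct mappings), an
 * optional relaxable direct map of the first 16MiB for ISA bridges when
 * CONFIG_INTEL_IOMMU_FLOPPY_WA is set, and the IOAPIC interrupt address
 * range 0xfee00000-0xfeefffff added just above.
 */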
5910 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5912 struct device_domain_info *info;
5913 struct context_entry *context;
5914 struct dmar_domain *domain;
5915 unsigned long flags;
5919 domain = find_domain(dev);
5923 spin_lock_irqsave(&device_domain_lock, flags);
5924 spin_lock(&iommu->lock);
5927 info = get_domain_info(dev);
5928 if (!info || !info->pasid_supported)
5931 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5932 if (WARN_ON(!context))
5935 ctx_lo = context[0].lo;
5937 if (!(ctx_lo & CONTEXT_PASIDE)) {
5938 ctx_lo |= CONTEXT_PASIDE;
5939 context[0].lo = ctx_lo;
5941 iommu->flush.flush_context(iommu,
5942 domain->iommu_did[iommu->seq_id],
5943 PCI_DEVID(info->bus, info->devfn),
5944 DMA_CCMD_MASK_NOBIT,
5945 DMA_CCMD_DEVICE_INVL);
5948 /* Enable PASID support in the device, if it wasn't already */
5949 if (!info->pasid_enabled)
5950 iommu_enable_dev_iotlb(info);
5955 spin_unlock(&iommu->lock);
5956 spin_unlock_irqrestore(&device_domain_lock, flags);
5961 static void intel_iommu_apply_resv_region(struct device *dev,
5962 struct iommu_domain *domain,
5963 struct iommu_resv_region *region)
5965 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5966 unsigned long start, end;
5968 start = IOVA_PFN(region->start);
5969 end = IOVA_PFN(region->start + region->length - 1);
5971 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5974 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5976 if (dev_is_pci(dev))
5977 return pci_device_group(dev);
5978 return generic_device_group(dev);
5981 static int intel_iommu_enable_auxd(struct device *dev)
5983 struct device_domain_info *info;
5984 struct intel_iommu *iommu;
5985 unsigned long flags;
5988 iommu = device_to_iommu(dev, NULL, NULL);
5989 if (!iommu || dmar_disabled)
5992 if (!sm_supported(iommu) || !pasid_supported(iommu))
5995 ret = intel_iommu_enable_pasid(iommu, dev);
5999 spin_lock_irqsave(&device_domain_lock, flags);
6000 info = get_domain_info(dev);
6001 info->auxd_enabled = 1;
6002 spin_unlock_irqrestore(&device_domain_lock, flags);
6007 static int intel_iommu_disable_auxd(struct device *dev)
6009 struct device_domain_info *info;
6010 unsigned long flags;
6012 spin_lock_irqsave(&device_domain_lock, flags);
6013 info = get_domain_info(dev);
6014 if (!WARN_ON(!info))
6015 info->auxd_enabled = 0;
6016 spin_unlock_irqrestore(&device_domain_lock, flags);
6022 * A PCI Express Designated Vendor Specific Extended Capability is defined
6023 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
6024 * for system software and tools to detect endpoint devices supporting the
6025 * Intel Scalable I/O Virtualization without host driver dependency.
6027 * Returns the address of the matching extended capability structure within
6028 * the device's PCI configuration space, or 0 if the device does not support it.
6031 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6036 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6038 pci_read_config_word(pdev, pos + 4, &vendor);
6039 pci_read_config_word(pdev, pos + 8, &id);
6040 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6043 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
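/*
 * The magic numbers above follow the PCIe DVSEC layout: 0x23 is the
 * DVSEC extended capability ID, the DVSEC vendor ID sits at offset 4 of
 * the capability and the DVSEC ID at offset 8, and Intel assigns DVSEC
 * ID 5 to Scalable IOV.
 */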
6050 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6052 if (feat == IOMMU_DEV_FEAT_AUX) {
6055 if (!dev_is_pci(dev) || dmar_disabled ||
6056 !scalable_mode_support() || !iommu_pasid_support())
6059 ret = pci_pasid_features(to_pci_dev(dev));
6063 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6066 if (feat == IOMMU_DEV_FEAT_SVA) {
6067 struct device_domain_info *info = get_domain_info(dev);
6069 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
6070 info->pasid_supported && info->pri_supported &&
6071 info->ats_supported;
6078 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6080 if (feat == IOMMU_DEV_FEAT_AUX)
6081 return intel_iommu_enable_auxd(dev);
6083 if (feat == IOMMU_DEV_FEAT_SVA) {
6084 struct device_domain_info *info = get_domain_info(dev);
6089 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6097 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6099 if (feat == IOMMU_DEV_FEAT_AUX)
6100 return intel_iommu_disable_auxd(dev);
6106 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6108 struct device_domain_info *info = get_domain_info(dev);
6110 if (feat == IOMMU_DEV_FEAT_AUX)
6111 return scalable_mode_support() && info && info->auxd_enabled;
6117 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6119 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6121 return dmar_domain->default_pasid > 0 ?
6122 dmar_domain->default_pasid : -EINVAL;
6125 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6128 return attach_deferred(dev);
6132 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6133 enum iommu_attr attr, void *data)
6135 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6136 unsigned long flags;
6139 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6143 case DOMAIN_ATTR_NESTING:
6144 spin_lock_irqsave(&device_domain_lock, flags);
6145 if (nested_mode_support() &&
6146 list_empty(&dmar_domain->devices)) {
6147 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6148 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6152 spin_unlock_irqrestore(&device_domain_lock, flags);
6163 * Check that the device does not live on an external-facing PCI port that is
6164 * marked as untrusted. Such devices should not be allowed to apply quirks and
6165 * thus bypass the IOMMU restrictions.
6167 static bool risky_device(struct pci_dev *pdev)
6169 if (pdev->untrusted) {
6171 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6172 pdev->vendor, pdev->device);
6173 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6179 const struct iommu_ops intel_iommu_ops = {
6180 .capable = intel_iommu_capable,
6181 .domain_alloc = intel_iommu_domain_alloc,
6182 .domain_free = intel_iommu_domain_free,
6183 .domain_set_attr = intel_iommu_domain_set_attr,
6184 .attach_dev = intel_iommu_attach_device,
6185 .detach_dev = intel_iommu_detach_device,
6186 .aux_attach_dev = intel_iommu_aux_attach_device,
6187 .aux_detach_dev = intel_iommu_aux_detach_device,
6188 .aux_get_pasid = intel_iommu_aux_get_pasid,
6189 .map = intel_iommu_map,
6190 .unmap = intel_iommu_unmap,
6191 .iova_to_phys = intel_iommu_iova_to_phys,
6192 .probe_device = intel_iommu_probe_device,
6193 .probe_finalize = intel_iommu_probe_finalize,
6194 .release_device = intel_iommu_release_device,
6195 .get_resv_regions = intel_iommu_get_resv_regions,
6196 .put_resv_regions = generic_iommu_put_resv_regions,
6197 .apply_resv_region = intel_iommu_apply_resv_region,
6198 .device_group = intel_iommu_device_group,
6199 .dev_has_feat = intel_iommu_dev_has_feat,
6200 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6201 .dev_enable_feat = intel_iommu_dev_enable_feat,
6202 .dev_disable_feat = intel_iommu_dev_disable_feat,
6203 .is_attach_deferred = intel_iommu_is_attach_deferred,
6204 .def_domain_type = device_def_domain_type,
6205 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6206 #ifdef CONFIG_INTEL_IOMMU_SVM
6207 .cache_invalidate = intel_iommu_sva_invalidate,
6208 .sva_bind_gpasid = intel_svm_bind_gpasid,
6209 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6210 .sva_bind = intel_svm_bind,
6211 .sva_unbind = intel_svm_unbind,
6212 .sva_get_pasid = intel_svm_get_pasid,
6213 .page_response = intel_svm_page_response,
6217 static void quirk_iommu_igfx(struct pci_dev *dev)
6219 if (risky_device(dev))
6222 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6226 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6235 /* Broadwell igfx malfunctions with dmar */
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6261 static void quirk_iommu_rwbf(struct pci_dev *dev)
6263 if (risky_device(dev))
6267 * Mobile 4 Series Chipset neglects to set RWBF capability,
6268 * but needs it. Same seems to hold for the desktop versions.
6270 pci_info(dev, "Forcing write-buffer flush capability\n");
6274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6279 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6280 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6283 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6284 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6285 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6286 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6287 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6288 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6289 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6290 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6292 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6296 if (risky_device(dev))
6299 if (pci_read_config_word(dev, GGC, &ggc))
6302 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6303 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6305 } else if (dmar_map_gfx) {
6306 /* we have to ensure the gfx device is idle before we flush */
6307 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6308 intel_iommu_strict = 1;
6311 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6312 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6313 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6314 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6316 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6320 if (!IS_GFX_DEVICE(dev))
6323 ver = (dev->device >> 8) & 0xff;
6324 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6325 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6326 ver != 0x9a && ver != 0xa7)
6329 if (risky_device(dev))
6332 pci_info(dev, "Skip IOMMU disabling for graphics\n");
6333 iommu_skip_te_disable = 1;
6335 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6337 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6338 ISOCH DMAR unit for the Azalia sound device, but not give it any
6339 TLB entries, which causes it to deadlock. Check for that. We do
6340 this in a function called from init_dmars(), instead of in a PCI
6341 quirk, because we don't want to print the obnoxious "BIOS broken"
6342 message if VT-d is actually disabled.
6344 static void __init check_tylersburg_isoch(void)
6346 struct pci_dev *pdev;
6347 uint32_t vtisochctrl;
6349 /* If there's no Azalia in the system anyway, forget it. */
6350 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6354 if (risky_device(pdev)) {
6361 /* System Management Registers. Might be hidden, in which case
6362 we can't do the sanity check. But that's OK, because the
6363 known-broken BIOSes _don't_ actually hide it, so far. */
6364 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6368 if (risky_device(pdev)) {
6373 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6380 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6381 if (vtisochctrl & 1)
6384 /* Drop all bits other than the number of TLB entries */
6385 vtisochctrl &= 0x1c;
6387 /* If we have the recommended number of TLB entries (16), fine. */
6388 if (vtisochctrl == 0x10)
6391 /* Zero TLB entries? The BIOS is clearly broken. */
6393 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6394 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6395 dmi_get_system_info(DMI_BIOS_VENDOR),
6396 dmi_get_system_info(DMI_BIOS_VERSION),
6397 dmi_get_system_info(DMI_PRODUCT_VERSION));
6398 iommu_identity_mapping |= IDENTMAP_AZALIA;
6402 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",