1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-map-ops.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "../irq_remapping.h"
51 #include "pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
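/*
 * A worked example with illustrative values: for gaw == 48 and
 * VTD_PAGE_SHIFT == 12,
 *
 *   __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1 = 0xfffffffff
 *   DOMAIN_MAX_ADDR(48)  = 0xfffffffff << 12 = 0xfffffffff000
 *
 * On 64-bit builds DOMAIN_MAX_PFN(48) equals __DOMAIN_MAX_PFN(48); the
 * min_t() clamp only matters when unsigned long is 32 bits wide.
 */
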
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
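/*
 * A worked reading of the bitmap above (illustrative only): bit N being
 * set advertises support for a page size of 2^N bytes, so ~0xFFFUL
 * clears bits 0-11 and sets every bit from 12 upwards, i.e. 4KiB, 8KiB,
 * 16KiB, ... This is how "all page sizes that are an order of 4KiB" is
 * expressed to the IOMMU core.
 */
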
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
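/*
 * A worked example of the helpers above, assuming a 48-bit address width
 * (illustrative values only):
 *
 *   width_to_agaw(48)       = DIV_ROUND_UP(48 - 30, 9) = 2
 *   agaw_to_level(2)        = 4    (4-level page table)
 *   agaw_to_width(2)        = 48
 *   level_to_offset_bits(2) = 9, so pfn_level_offset(pfn, 2) = (pfn >> 9) & 0x1ff
 *   level_size(2)           = 512 VT-d pages = 2MiB of IOVA per level-2 entry
 *   lvl_to_nr_pages(2)      = 512
 */
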
151 /* VT-d pages must never be larger than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
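/*
 * Note on the conversions above: with 4KiB MM pages (PAGE_SHIFT ==
 * VTD_PAGE_SHIFT == 12) the shift is zero and dma_to_mm_pfn() /
 * mm_to_dma_pfn() are identity operations. On a hypothetical 64KiB-page
 * kernel the shift would be 4, i.e. one MM pfn covers 16 VT-d pfns,
 * which is why a VT-d page may never be larger than an MM page.
 */
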
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
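/*
 * For orientation, the legacy context-entry layout implied by the
 * helpers above (see the VT-d specification for the authoritative
 * format):
 *
 *   lo bit  0       present
 *   lo bit  1       fault processing disable
 *   lo bits 3:2     translation type
 *   lo bit  11      PASID enable
 *   lo bits 63:12   address of the translation structure root
 *   hi bits 2:0     address width (AGAW)
 *   hi bit  3       "copied" marker for entries inherited from an old kernel
 *   hi bits 23:8    domain identifier
 */
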
290 /*
291  * This domain is a statically identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 #define for_each_domain_iommu(idx, domain)                      \
300         for (idx = 0; idx < g_num_of_iommus; idx++)             \
301                 if (domain->iommu_refcnt[idx])
302
303 struct dmar_rmrr_unit {
304         struct list_head list;          /* list of rmrr units   */
305         struct acpi_dmar_header *hdr;   /* ACPI header          */
306         u64     base_address;           /* reserved base address*/
307         u64     end_address;            /* reserved end address */
308         struct dmar_dev_scope *devices; /* target devices */
309         int     devices_cnt;            /* target device count */
310 };
311
312 struct dmar_atsr_unit {
313         struct list_head list;          /* list of ATSR units */
314         struct acpi_dmar_header *hdr;   /* ACPI header */
315         struct dmar_dev_scope *devices; /* target devices */
316         int devices_cnt;                /* target device count */
317         u8 include_all:1;               /* include all ports */
318 };
319
320 static LIST_HEAD(dmar_atsr_units);
321 static LIST_HEAD(dmar_rmrr_units);
322
323 #define for_each_rmrr_units(rmrr) \
324         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326 /* number of registered intel_iommus */
327 static int g_num_of_iommus;
328
329 static void domain_exit(struct dmar_domain *domain);
330 static void domain_remove_dev_info(struct dmar_domain *domain);
331 static void dmar_remove_one_dev_info(struct device *dev);
332 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333 static int intel_iommu_attach_device(struct iommu_domain *domain,
334                                      struct device *dev);
335 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336                                             dma_addr_t iova);
337
338 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339 int dmar_disabled = 0;
340 #else
341 int dmar_disabled = 1;
342 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345 int intel_iommu_sm = 1;
346 #else
347 int intel_iommu_sm;
348 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350 int intel_iommu_enabled = 0;
351 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353 static int dmar_map_gfx = 1;
354 static int dmar_forcedac;
355 static int intel_iommu_strict;
356 static int intel_iommu_superpage = 1;
357 static int iommu_identity_mapping;
358 static int intel_no_bounce;
359 static int iommu_skip_te_disable;
360
361 #define IDENTMAP_GFX            2
362 #define IDENTMAP_AZALIA         4
363
364 int intel_iommu_gfx_mapped;
365 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368 struct device_domain_info *get_domain_info(struct device *dev)
369 {
370         struct device_domain_info *info;
371
372         if (!dev)
373                 return NULL;
374
375         info = dev_iommu_priv_get(dev);
376         if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377                 return NULL;
378
379         return info;
380 }
381
382 DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
384
385 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
386                                 to_pci_dev(d)->untrusted)
387
388 /*
389  * Iterate over elements in device_domain_list and call the specified
390  * callback @fn against each element.
391  */
392 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393                                      void *data), void *data)
394 {
395         int ret = 0;
396         unsigned long flags;
397         struct device_domain_info *info;
398
399         spin_lock_irqsave(&device_domain_lock, flags);
400         list_for_each_entry(info, &device_domain_list, global) {
401                 ret = fn(info, data);
402                 if (ret) {
403                         spin_unlock_irqrestore(&device_domain_lock, flags);
404                         return ret;
405                 }
406         }
407         spin_unlock_irqrestore(&device_domain_lock, flags);
408
409         return 0;
410 }
411
412 const struct iommu_ops intel_iommu_ops;
413
414 static bool translation_pre_enabled(struct intel_iommu *iommu)
415 {
416         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417 }
418
419 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420 {
421         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422 }
423
424 static void init_translation_status(struct intel_iommu *iommu)
425 {
426         u32 gsts;
427
428         gsts = readl(iommu->reg + DMAR_GSTS_REG);
429         if (gsts & DMA_GSTS_TES)
430                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431 }
432
433 static int __init intel_iommu_setup(char *str)
434 {
435         if (!str)
436                 return -EINVAL;
437         while (*str) {
438                 if (!strncmp(str, "on", 2)) {
439                         dmar_disabled = 0;
440                         pr_info("IOMMU enabled\n");
441                 } else if (!strncmp(str, "off", 3)) {
442                         dmar_disabled = 1;
443                         no_platform_optin = 1;
444                         pr_info("IOMMU disabled\n");
445                 } else if (!strncmp(str, "igfx_off", 8)) {
446                         dmar_map_gfx = 0;
447                         pr_info("Disable GFX device mapping\n");
448                 } else if (!strncmp(str, "forcedac", 8)) {
449                         pr_info("Forcing DAC for PCI devices\n");
450                         dmar_forcedac = 1;
451                 } else if (!strncmp(str, "strict", 6)) {
452                         pr_info("Disable batched IOTLB flush\n");
453                         intel_iommu_strict = 1;
454                 } else if (!strncmp(str, "sp_off", 6)) {
455                         pr_info("Disable supported super page\n");
456                         intel_iommu_superpage = 0;
457                 } else if (!strncmp(str, "sm_on", 5)) {
458                         pr_info("Intel-IOMMU: scalable mode supported\n");
459                         intel_iommu_sm = 1;
460                 } else if (!strncmp(str, "tboot_noforce", 13)) {
461                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462                         intel_iommu_tboot_noforce = 1;
463                 } else if (!strncmp(str, "nobounce", 8)) {
464                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465                         intel_no_bounce = 1;
466                 }
467
468                 str += strcspn(str, ",");
469                 while (*str == ',')
470                         str++;
471         }
472         return 0;
473 }
474 __setup("intel_iommu=", intel_iommu_setup);
475
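/*
 * Example command lines accepted by the parser above (illustrative):
 * options are comma separated, e.g.
 *
 *   intel_iommu=on,sm_on    enable DMA remapping and scalable mode
 *   intel_iommu=on,strict   enable remapping with strict IOTLB flushing
 *   intel_iommu=off         disable remapping and clear the platform opt-in
 *
 * Tokens that match none of the strncmp() cases are silently skipped.
 */
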
476 static struct kmem_cache *iommu_domain_cache;
477 static struct kmem_cache *iommu_devinfo_cache;
478
479 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
480 {
481         struct dmar_domain **domains;
482         int idx = did >> 8;
483
484         domains = iommu->domains[idx];
485         if (!domains)
486                 return NULL;
487
488         return domains[did & 0xff];
489 }
490
491 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492                              struct dmar_domain *domain)
493 {
494         struct dmar_domain **domains;
495         int idx = did >> 8;
496
497         if (!iommu->domains[idx]) {
498                 size_t size = 256 * sizeof(struct dmar_domain *);
499                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500         }
501
502         domains = iommu->domains[idx];
503         if (WARN_ON(!domains))
504                 return;
505         else
506                 domains[did & 0xff] = domain;
507 }
508
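/*
 * The helpers above treat iommu->domains as a two-level table indexed by
 * domain id: the top 8 bits select a lazily allocated page of 256
 * pointers, the low 8 bits select the slot within it. For example
 * (illustrative value), did == 0x1234 lands in iommu->domains[0x12][0x34].
 */
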
509 void *alloc_pgtable_page(int node)
510 {
511         struct page *page;
512         void *vaddr = NULL;
513
514         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515         if (page)
516                 vaddr = page_address(page);
517         return vaddr;
518 }
519
520 void free_pgtable_page(void *vaddr)
521 {
522         free_page((unsigned long)vaddr);
523 }
524
525 static inline void *alloc_domain_mem(void)
526 {
527         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 }
529
530 static void free_domain_mem(void *vaddr)
531 {
532         kmem_cache_free(iommu_domain_cache, vaddr);
533 }
534
535 static inline void *alloc_devinfo_mem(void)
536 {
537         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 }
539
540 static inline void free_devinfo_mem(void *vaddr)
541 {
542         kmem_cache_free(iommu_devinfo_cache, vaddr);
543 }
544
545 static inline int domain_type_is_si(struct dmar_domain *domain)
546 {
547         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548 }
549
550 static inline bool domain_use_first_level(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 /*
564  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
565  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
566  * the returned SAGAW.
567  */
568 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
569 {
570         unsigned long fl_sagaw, sl_sagaw;
571
572         fl_sagaw = BIT(2) | (cap_5lp_support(iommu->cap) ? BIT(3) : 0);
573         sl_sagaw = cap_sagaw(iommu->cap);
574
575         /* Second level only. */
576         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
577                 return sl_sagaw;
578
579         /* First level only. */
580         if (!ecap_slts(iommu->ecap))
581                 return fl_sagaw;
582
583         return fl_sagaw & sl_sagaw;
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588         unsigned long sagaw;
589         int agaw = -1;
590
591         sagaw = __iommu_calculate_sagaw(iommu);
592         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
593                 if (test_bit(agaw, &sagaw))
594                         break;
595         }
596
597         return agaw;
598 }
599
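/*
 * A worked example of the SAGAW handling above (illustrative only):
 * SAGAW bit 2 encodes 4-level (48-bit) and bit 3 encodes 5-level
 * (57-bit) tables. With DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the loop
 * starts at width_to_agaw(57) == 3; a unit that advertises only bit 2
 * therefore falls back to agaw 2, i.e. a 48-bit, 4-level page table.
 */
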
600 /*
601  * Calculate max SAGAW for each iommu.
602  */
603 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
604 {
605         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
606 }
607
608 /*
609  * calculate agaw for each iommu.
610  * "SAGAW" may be different across iommus, use a default agaw, and
611  * "SAGAW" may be different across iommus; use a default agaw, and
612  * fall back to a smaller supported agaw for iommus that don't support it.
613 int iommu_calculate_agaw(struct intel_iommu *iommu)
614 {
615         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
616 }
617
618 /* This function only returns a single iommu in a domain */
619 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
620 {
621         int iommu_id;
622
623         /* si_domain and vm domain should not get here. */
624         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
625                 return NULL;
626
627         for_each_domain_iommu(iommu_id, domain)
628                 break;
629
630         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
631                 return NULL;
632
633         return g_iommus[iommu_id];
634 }
635
636 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
637 {
638         return sm_supported(iommu) ?
639                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
640 }
641
642 static void domain_update_iommu_coherency(struct dmar_domain *domain)
643 {
644         struct dmar_drhd_unit *drhd;
645         struct intel_iommu *iommu;
646         bool found = false;
647         int i;
648
649         domain->iommu_coherency = 1;
650
651         for_each_domain_iommu(i, domain) {
652                 found = true;
653                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
654                         domain->iommu_coherency = 0;
655                         break;
656                 }
657         }
658         if (found)
659                 return;
660
661         /* No hardware attached; use lowest common denominator */
662         rcu_read_lock();
663         for_each_active_iommu(iommu, drhd) {
664                 if (!iommu_paging_structure_coherency(iommu)) {
665                         domain->iommu_coherency = 0;
666                         break;
667                 }
668         }
669         rcu_read_unlock();
670 }
671
672 static int domain_update_iommu_snooping(struct intel_iommu *skip)
673 {
674         struct dmar_drhd_unit *drhd;
675         struct intel_iommu *iommu;
676         int ret = 1;
677
678         rcu_read_lock();
679         for_each_active_iommu(iommu, drhd) {
680                 if (iommu != skip) {
681                         /*
682                          * If the hardware is operating in the scalable mode,
683                          * the snooping control is always supported since we
684                          * always set PASID-table-entry.PGSNP bit if the domain
685                          * is managed outside (UNMANAGED).
686                          */
687                         if (!sm_supported(iommu) &&
688                             !ecap_sc_support(iommu->ecap)) {
689                                 ret = 0;
690                                 break;
691                         }
692                 }
693         }
694         rcu_read_unlock();
695
696         return ret;
697 }
698
699 static int domain_update_iommu_superpage(struct dmar_domain *domain,
700                                          struct intel_iommu *skip)
701 {
702         struct dmar_drhd_unit *drhd;
703         struct intel_iommu *iommu;
704         int mask = 0x3;
705
706         if (!intel_iommu_superpage) {
707                 return 0;
708         }
709
710         /* set iommu_superpage to the smallest common denominator */
711         rcu_read_lock();
712         for_each_active_iommu(iommu, drhd) {
713                 if (iommu != skip) {
714                         if (domain && domain_use_first_level(domain)) {
715                                 if (!cap_fl1gp_support(iommu->cap))
716                                         mask = 0x1;
717                         } else {
718                                 mask &= cap_super_page_val(iommu->cap);
719                         }
720
721                         if (!mask)
722                                 break;
723                 }
724         }
725         rcu_read_unlock();
726
727         return fls(mask);
728 }
729
730 static int domain_update_device_node(struct dmar_domain *domain)
731 {
732         struct device_domain_info *info;
733         int nid = NUMA_NO_NODE;
734
735         assert_spin_locked(&device_domain_lock);
736
737         if (list_empty(&domain->devices))
738                 return NUMA_NO_NODE;
739
740         list_for_each_entry(info, &domain->devices, link) {
741                 if (!info->dev)
742                         continue;
743
744                 /*
745                  * There could be multiple device NUMA nodes, as devices within
746                  * the same domain may sit behind different IOMMUs. There is no
747                  * perfect answer in such a situation, so we simply pick the
748                  * first node we find.
749                  */
750                 nid = dev_to_node(info->dev);
751                 if (nid != NUMA_NO_NODE)
752                         break;
753         }
754
755         return nid;
756 }
757
758 /* Some capabilities may be different across iommus */
759 static void domain_update_iommu_cap(struct dmar_domain *domain)
760 {
761         domain_update_iommu_coherency(domain);
762         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
763         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
764
765         /*
766          * If RHSA is missing, we should default to the device numa domain
767          * as fall back.
768          */
769         if (domain->nid == NUMA_NO_NODE)
770                 domain->nid = domain_update_device_node(domain);
771
772         /*
773          * First-level translation restricts the input-address to a
774          * canonical address (i.e., address bits 63:N have the same
775          * value as address bit [N-1], where N is 48-bits with 4-level
776          * paging and 57-bits with 5-level paging). Hence, skip bit
777          * [N-1].
778          */
779         if (domain_use_first_level(domain))
780                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
781         else
782                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
783 }
784
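/*
 * For example (illustrative values), a domain with gaw == 48 using
 * first-level translation gets aperture_end = __DOMAIN_MAX_ADDR(47) =
 * (1ULL << 47) - 1, keeping IOVAs below the non-canonical hole, while
 * the same domain on second-level tables may use up to (1ULL << 48) - 1.
 */
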
785 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
786                                          u8 devfn, int alloc)
787 {
788         struct root_entry *root = &iommu->root_entry[bus];
789         struct context_entry *context;
790         u64 *entry;
791
792         entry = &root->lo;
793         if (sm_supported(iommu)) {
794                 if (devfn >= 0x80) {
795                         devfn -= 0x80;
796                         entry = &root->hi;
797                 }
798                 devfn *= 2;
799         }
800         if (*entry & 1)
801                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
802         else {
803                 unsigned long phy_addr;
804                 if (!alloc)
805                         return NULL;
806
807                 context = alloc_pgtable_page(iommu->node);
808                 if (!context)
809                         return NULL;
810
811                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
812                 phy_addr = virt_to_phys((void *)context);
813                 *entry = phy_addr | 1;
814                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
815         }
816         return &context[devfn];
817 }
818
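/*
 * Layout note for the lookup above: each root entry covers one PCI bus.
 * In legacy mode the 4KiB context table holds 256 128-bit entries, one
 * per devfn. In scalable mode the root entry is split: root->lo covers
 * devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff, and each device
 * consumes two consecutive 128-bit slots (hence devfn *= 2) to form its
 * 256-bit scalable-mode context entry.
 */
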
819 static bool attach_deferred(struct device *dev)
820 {
821         return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
822 }
823
824 /**
825  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
826  *                               sub-hierarchy of a candidate PCI-PCI bridge
827  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
828  * @bridge: the candidate PCI-PCI bridge
829  *
830  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
831  */
832 static bool
833 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
834 {
835         struct pci_dev *pdev, *pbridge;
836
837         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
838                 return false;
839
840         pdev = to_pci_dev(dev);
841         pbridge = to_pci_dev(bridge);
842
843         if (pbridge->subordinate &&
844             pbridge->subordinate->number <= pdev->bus->number &&
845             pbridge->subordinate->busn_res.end >= pdev->bus->number)
846                 return true;
847
848         return false;
849 }
850
851 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
852 {
853         struct dmar_drhd_unit *drhd;
854         u32 vtbar;
855         int rc;
856
857         /* We know that this device on this chipset has its own IOMMU.
858          * If we find it under a different IOMMU, then the BIOS is lying
859          * to us. Hope that the IOMMU for this device is actually
860          * disabled, and it needs no translation...
861          */
862         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
863         if (rc) {
864                 /* "can't" happen */
865                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
866                 return false;
867         }
868         vtbar &= 0xffff0000;
869
870         /* we know that this iommu should be at offset 0xa000 from vtbar */
871         drhd = dmar_find_matched_drhd_unit(pdev);
872         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
873                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
874                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
875                 return true;
876         }
877
878         return false;
879 }
880
881 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
882 {
883         if (!iommu || iommu->drhd->ignored)
884                 return true;
885
886         if (dev_is_pci(dev)) {
887                 struct pci_dev *pdev = to_pci_dev(dev);
888
889                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
890                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
891                     quirk_ioat_snb_local_iommu(pdev))
892                         return true;
893         }
894
895         return false;
896 }
897
898 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
899 {
900         struct dmar_drhd_unit *drhd = NULL;
901         struct pci_dev *pdev = NULL;
902         struct intel_iommu *iommu;
903         struct device *tmp;
904         u16 segment = 0;
905         int i;
906
907         if (!dev)
908                 return NULL;
909
910         if (dev_is_pci(dev)) {
911                 struct pci_dev *pf_pdev;
912
913                 pdev = pci_real_dma_dev(to_pci_dev(dev));
914
915                 /* VFs aren't listed in scope tables; we need to look up
916                  * the PF instead to find the IOMMU. */
917                 pf_pdev = pci_physfn(pdev);
918                 dev = &pf_pdev->dev;
919                 segment = pci_domain_nr(pdev->bus);
920         } else if (has_acpi_companion(dev))
921                 dev = &ACPI_COMPANION(dev)->dev;
922
923         rcu_read_lock();
924         for_each_iommu(iommu, drhd) {
925                 if (pdev && segment != drhd->segment)
926                         continue;
927
928                 for_each_active_dev_scope(drhd->devices,
929                                           drhd->devices_cnt, i, tmp) {
930                         if (tmp == dev) {
931                                 /* For a VF use its original BDF# not that of the PF
932                                  * which we used for the IOMMU lookup. Strictly speaking
933                                  * we could do this for all PCI devices; we only need to
934                                  * get the BDF# from the scope table for ACPI matches. */
935                                 if (pdev && pdev->is_virtfn)
936                                         goto got_pdev;
937
938                                 if (bus && devfn) {
939                                         *bus = drhd->devices[i].bus;
940                                         *devfn = drhd->devices[i].devfn;
941                                 }
942                                 goto out;
943                         }
944
945                         if (is_downstream_to_pci_bridge(dev, tmp))
946                                 goto got_pdev;
947                 }
948
949                 if (pdev && drhd->include_all) {
950                 got_pdev:
951                         if (bus && devfn) {
952                                 *bus = pdev->bus->number;
953                                 *devfn = pdev->devfn;
954                         }
955                         goto out;
956                 }
957         }
958         iommu = NULL;
959  out:
960         if (iommu_is_dummy(iommu, dev))
961                 iommu = NULL;
962
963         rcu_read_unlock();
964
965         return iommu;
966 }
967
968 static void domain_flush_cache(struct dmar_domain *domain,
969                                void *addr, int size)
970 {
971         if (!domain->iommu_coherency)
972                 clflush_cache_range(addr, size);
973 }
974
975 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
976 {
977         struct context_entry *context;
978         int ret = 0;
979         unsigned long flags;
980
981         spin_lock_irqsave(&iommu->lock, flags);
982         context = iommu_context_addr(iommu, bus, devfn, 0);
983         if (context)
984                 ret = context_present(context);
985         spin_unlock_irqrestore(&iommu->lock, flags);
986         return ret;
987 }
988
989 static void free_context_table(struct intel_iommu *iommu)
990 {
991         int i;
992         unsigned long flags;
993         struct context_entry *context;
994
995         spin_lock_irqsave(&iommu->lock, flags);
996         if (!iommu->root_entry) {
997                 goto out;
998         }
999         for (i = 0; i < ROOT_ENTRY_NR; i++) {
1000                 context = iommu_context_addr(iommu, i, 0, 0);
1001                 if (context)
1002                         free_pgtable_page(context);
1003
1004                 if (!sm_supported(iommu))
1005                         continue;
1006
1007                 context = iommu_context_addr(iommu, i, 0x80, 0);
1008                 if (context)
1009                         free_pgtable_page(context);
1010
1011         }
1012         free_pgtable_page(iommu->root_entry);
1013         iommu->root_entry = NULL;
1014 out:
1015         spin_unlock_irqrestore(&iommu->lock, flags);
1016 }
1017
1018 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1019                                       unsigned long pfn, int *target_level)
1020 {
1021         struct dma_pte *parent, *pte;
1022         int level = agaw_to_level(domain->agaw);
1023         int offset;
1024
1025         BUG_ON(!domain->pgd);
1026
1027         if (!domain_pfn_supported(domain, pfn))
1028                 /* Address beyond IOMMU's addressing capabilities. */
1029                 return NULL;
1030
1031         parent = domain->pgd;
1032
1033         while (1) {
1034                 void *tmp_page;
1035
1036                 offset = pfn_level_offset(pfn, level);
1037                 pte = &parent[offset];
1038                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1039                         break;
1040                 if (level == *target_level)
1041                         break;
1042
1043                 if (!dma_pte_present(pte)) {
1044                         uint64_t pteval;
1045
1046                         tmp_page = alloc_pgtable_page(domain->nid);
1047
1048                         if (!tmp_page)
1049                                 return NULL;
1050
1051                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1052                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1053                         if (domain_use_first_level(domain)) {
1054                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1055                                 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1056                                         pteval |= DMA_FL_PTE_ACCESS;
1057                         }
1058                         if (cmpxchg64(&pte->val, 0ULL, pteval))
1059                                 /* Someone else set it while we were thinking; use theirs. */
1060                                 free_pgtable_page(tmp_page);
1061                         else
1062                                 domain_flush_cache(domain, pte, sizeof(*pte));
1063                 }
1064                 if (level == 1)
1065                         break;
1066
1067                 parent = phys_to_virt(dma_pte_addr(pte));
1068                 level--;
1069         }
1070
1071         if (!*target_level)
1072                 *target_level = level;
1073
1074         return pte;
1075 }
1076
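/*
 * A worked example of the walk above for a 4-level table (illustrative):
 * the offset consumed at each level of a given IOVA pfn is
 *
 *   level 4: (pfn >> 27) & 0x1ff
 *   level 3: (pfn >> 18) & 0x1ff
 *   level 2: (pfn >>  9) & 0x1ff
 *   level 1:  pfn        & 0x1ff
 *
 * Callers pass *target_level == 1 for a 4KiB leaf PTE, 2 for a 2MiB
 * superpage slot, or 0 to stop at whatever level is already present.
 */
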
1077 /* return address's pte at specific level */
1078 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1079                                          unsigned long pfn,
1080                                          int level, int *large_page)
1081 {
1082         struct dma_pte *parent, *pte;
1083         int total = agaw_to_level(domain->agaw);
1084         int offset;
1085
1086         parent = domain->pgd;
1087         while (level <= total) {
1088                 offset = pfn_level_offset(pfn, total);
1089                 pte = &parent[offset];
1090                 if (level == total)
1091                         return pte;
1092
1093                 if (!dma_pte_present(pte)) {
1094                         *large_page = total;
1095                         break;
1096                 }
1097
1098                 if (dma_pte_superpage(pte)) {
1099                         *large_page = total;
1100                         return pte;
1101                 }
1102
1103                 parent = phys_to_virt(dma_pte_addr(pte));
1104                 total--;
1105         }
1106         return NULL;
1107 }
1108
1109 /* clear last level pte, a tlb flush should be followed */
1110 static void dma_pte_clear_range(struct dmar_domain *domain,
1111                                 unsigned long start_pfn,
1112                                 unsigned long last_pfn)
1113 {
1114         unsigned int large_page;
1115         struct dma_pte *first_pte, *pte;
1116
1117         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1118         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1119         BUG_ON(start_pfn > last_pfn);
1120
1121         /* we don't need lock here; nobody else touches the iova range */
1122         do {
1123                 large_page = 1;
1124                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1125                 if (!pte) {
1126                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1127                         continue;
1128                 }
1129                 do {
1130                         dma_clear_pte(pte);
1131                         start_pfn += lvl_to_nr_pages(large_page);
1132                         pte++;
1133                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1134
1135                 domain_flush_cache(domain, first_pte,
1136                                    (void *)pte - (void *)first_pte);
1137
1138         } while (start_pfn && start_pfn <= last_pfn);
1139 }
1140
1141 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1142                                int retain_level, struct dma_pte *pte,
1143                                unsigned long pfn, unsigned long start_pfn,
1144                                unsigned long last_pfn)
1145 {
1146         pfn = max(start_pfn, pfn);
1147         pte = &pte[pfn_level_offset(pfn, level)];
1148
1149         do {
1150                 unsigned long level_pfn;
1151                 struct dma_pte *level_pte;
1152
1153                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1154                         goto next;
1155
1156                 level_pfn = pfn & level_mask(level);
1157                 level_pte = phys_to_virt(dma_pte_addr(pte));
1158
1159                 if (level > 2) {
1160                         dma_pte_free_level(domain, level - 1, retain_level,
1161                                            level_pte, level_pfn, start_pfn,
1162                                            last_pfn);
1163                 }
1164
1165                 /*
1166                  * Free the page table if we're below the level we want to
1167                  * retain and the range covers the entire table.
1168                  */
1169                 if (level < retain_level && !(start_pfn > level_pfn ||
1170                       last_pfn < level_pfn + level_size(level) - 1)) {
1171                         dma_clear_pte(pte);
1172                         domain_flush_cache(domain, pte, sizeof(*pte));
1173                         free_pgtable_page(level_pte);
1174                 }
1175 next:
1176                 pfn += level_size(level);
1177         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1178 }
1179
1180 /*
1181  * clear last level (leaf) ptes and free page table pages below the
1182  * level we wish to keep intact.
1183  */
1184 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1185                                    unsigned long start_pfn,
1186                                    unsigned long last_pfn,
1187                                    int retain_level)
1188 {
1189         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1190         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1191         BUG_ON(start_pfn > last_pfn);
1192
1193         dma_pte_clear_range(domain, start_pfn, last_pfn);
1194
1195         /* We don't need lock here; nobody else touches the iova range */
1196         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1197                            domain->pgd, 0, start_pfn, last_pfn);
1198
1199         /* free pgd */
1200         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1201                 free_pgtable_page(domain->pgd);
1202                 domain->pgd = NULL;
1203         }
1204 }
1205
1206 /* When a page at a given level is being unlinked from its parent, we don't
1207    need to *modify* it at all. All we need to do is make a list of all the
1208    pages which can be freed just as soon as we've flushed the IOTLB and we
1209    know the hardware page-walk will no longer touch them.
1210    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1211    be freed. */
1212 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1213                                             int level, struct dma_pte *pte,
1214                                             struct page *freelist)
1215 {
1216         struct page *pg;
1217
1218         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1219         pg->freelist = freelist;
1220         freelist = pg;
1221
1222         if (level == 1)
1223                 return freelist;
1224
1225         pte = page_address(pg);
1226         do {
1227                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1228                         freelist = dma_pte_list_pagetables(domain, level - 1,
1229                                                            pte, freelist);
1230                 pte++;
1231         } while (!first_pte_in_page(pte));
1232
1233         return freelist;
1234 }
1235
1236 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1237                                         struct dma_pte *pte, unsigned long pfn,
1238                                         unsigned long start_pfn,
1239                                         unsigned long last_pfn,
1240                                         struct page *freelist)
1241 {
1242         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1243
1244         pfn = max(start_pfn, pfn);
1245         pte = &pte[pfn_level_offset(pfn, level)];
1246
1247         do {
1248                 unsigned long level_pfn;
1249
1250                 if (!dma_pte_present(pte))
1251                         goto next;
1252
1253                 level_pfn = pfn & level_mask(level);
1254
1255                 /* If range covers entire pagetable, free it */
1256                 if (start_pfn <= level_pfn &&
1257                     last_pfn >= level_pfn + level_size(level) - 1) {
1258                         /* These subordinate page tables are going away entirely. Don't
1259                            bother to clear them; we're just going to *free* them. */
1260                         if (level > 1 && !dma_pte_superpage(pte))
1261                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1262
1263                         dma_clear_pte(pte);
1264                         if (!first_pte)
1265                                 first_pte = pte;
1266                         last_pte = pte;
1267                 } else if (level > 1) {
1268                         /* Recurse down into a level that isn't *entirely* obsolete */
1269                         freelist = dma_pte_clear_level(domain, level - 1,
1270                                                        phys_to_virt(dma_pte_addr(pte)),
1271                                                        level_pfn, start_pfn, last_pfn,
1272                                                        freelist);
1273                 }
1274 next:
1275                 pfn += level_size(level);
1276         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1277
1278         if (first_pte)
1279                 domain_flush_cache(domain, first_pte,
1280                                    (void *)++last_pte - (void *)first_pte);
1281
1282         return freelist;
1283 }
1284
1285 /* We can't just free the pages because the IOMMU may still be walking
1286    the page tables, and may have cached the intermediate levels. The
1287    pages can only be freed after the IOTLB flush has been done. */
1288 static struct page *domain_unmap(struct dmar_domain *domain,
1289                                  unsigned long start_pfn,
1290                                  unsigned long last_pfn)
1291 {
1292         struct page *freelist;
1293
1294         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1295         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1296         BUG_ON(start_pfn > last_pfn);
1297
1298         /* we don't need lock here; nobody else touches the iova range */
1299         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1300                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1301
1302         /* free pgd */
1303         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1304                 struct page *pgd_page = virt_to_page(domain->pgd);
1305                 pgd_page->freelist = freelist;
1306                 freelist = pgd_page;
1307
1308                 domain->pgd = NULL;
1309         }
1310
1311         return freelist;
1312 }
1313
1314 static void dma_free_pagelist(struct page *freelist)
1315 {
1316         struct page *pg;
1317
1318         while ((pg = freelist)) {
1319                 freelist = pg->freelist;
1320                 free_pgtable_page(page_address(pg));
1321         }
1322 }
1323
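/*
 * A sketch of how the freelist above is consumed, following the callers
 * later in this file: the page tables are unlinked first, the IOTLB is
 * flushed, and only then are the pages handed back:
 *
 *   freelist = domain_unmap(domain, start_pfn, last_pfn);
 *   ... flush the IOTLB for the range on every iommu in the domain ...
 *   dma_free_pagelist(freelist);
 */
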
1324 static void iova_entry_free(unsigned long data)
1325 {
1326         struct page *freelist = (struct page *)data;
1327
1328         dma_free_pagelist(freelist);
1329 }
1330
1331 /* iommu handling */
1332 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1333 {
1334         struct root_entry *root;
1335         unsigned long flags;
1336
1337         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1338         if (!root) {
1339                 pr_err("Allocating root entry for %s failed\n",
1340                         iommu->name);
1341                 return -ENOMEM;
1342         }
1343
1344         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1345
1346         spin_lock_irqsave(&iommu->lock, flags);
1347         iommu->root_entry = root;
1348         spin_unlock_irqrestore(&iommu->lock, flags);
1349
1350         return 0;
1351 }
1352
1353 static void iommu_set_root_entry(struct intel_iommu *iommu)
1354 {
1355         u64 addr;
1356         u32 sts;
1357         unsigned long flag;
1358
1359         addr = virt_to_phys(iommu->root_entry);
1360         if (sm_supported(iommu))
1361                 addr |= DMA_RTADDR_SMT;
1362
1363         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1364         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1365
1366         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1367
1368         /* Make sure hardware complete it */
1369         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1370                       readl, (sts & DMA_GSTS_RTPS), sts);
1371
1372         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1373
1374         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1375         if (sm_supported(iommu))
1376                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1377         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1378 }
1379
1380 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1381 {
1382         u32 val;
1383         unsigned long flag;
1384
1385         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1386                 return;
1387
1388         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1389         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1390
1391         /* Make sure hardware complete it */
1392         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1393                       readl, (!(val & DMA_GSTS_WBFS)), val);
1394
1395         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1396 }
1397
1398 /* return value determines whether we need a write buffer flush */
1399 static void __iommu_flush_context(struct intel_iommu *iommu,
1400                                   u16 did, u16 source_id, u8 function_mask,
1401                                   u64 type)
1402 {
1403         u64 val = 0;
1404         unsigned long flag;
1405
1406         switch (type) {
1407         case DMA_CCMD_GLOBAL_INVL:
1408                 val = DMA_CCMD_GLOBAL_INVL;
1409                 break;
1410         case DMA_CCMD_DOMAIN_INVL:
1411                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1412                 break;
1413         case DMA_CCMD_DEVICE_INVL:
1414                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1415                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1416                 break;
1417         default:
1418                 BUG();
1419         }
1420         val |= DMA_CCMD_ICC;
1421
1422         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1423         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1424
1425         /* Make sure hardware complete it */
1426         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1427                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1428
1429         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1430 }
1431
1432 /* return value determines whether we need a write buffer flush */
1433 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1434                                 u64 addr, unsigned int size_order, u64 type)
1435 {
1436         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1437         u64 val = 0, val_iva = 0;
1438         unsigned long flag;
1439
1440         switch (type) {
1441         case DMA_TLB_GLOBAL_FLUSH:
1442                 /* global flush doesn't need to set IVA_REG */
1443                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1444                 break;
1445         case DMA_TLB_DSI_FLUSH:
1446                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1447                 break;
1448         case DMA_TLB_PSI_FLUSH:
1449                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1450                 /* IH bit is passed in as part of address */
1451                 val_iva = size_order | addr;
1452                 break;
1453         default:
1454                 BUG();
1455         }
1456         /* Note: set drain read/write */
1457 #if 0
1458         /*
1459          * This is probably just to be extra safe. It looks like we can
1460          * ignore it without any impact.
1461          */
1462         if (cap_read_drain(iommu->cap))
1463                 val |= DMA_TLB_READ_DRAIN;
1464 #endif
1465         if (cap_write_drain(iommu->cap))
1466                 val |= DMA_TLB_WRITE_DRAIN;
1467
1468         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1469         /* Note: Only uses first TLB reg currently */
1470         if (val_iva)
1471                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1472         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1473
1474         /* Make sure hardware complete it */
1475         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1476                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1477
1478         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1479
1480         /* check IOTLB invalidation granularity */
1481         if (DMA_TLB_IAIG(val) == 0)
1482                 pr_err("Flush IOTLB failed\n");
1483         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1484                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1485                         (unsigned long long)DMA_TLB_IIRG(type),
1486                         (unsigned long long)DMA_TLB_IAIG(val));
1487 }
1488
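/*
 * Example of a page-selective invalidation through the function above
 * (illustrative values): flushing 16 pages at IOVA 0x100000 uses
 * size_order == 4 and an address aligned to 64KiB, so val_iva ==
 * 0x100000 | 4; the address-mask value in the low bits tells the
 * hardware how many low pfn bits of the address to ignore.
 */
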
1489 static struct device_domain_info *
1490 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1491                          u8 bus, u8 devfn)
1492 {
1493         struct device_domain_info *info;
1494
1495         assert_spin_locked(&device_domain_lock);
1496
1497         if (!iommu->qi)
1498                 return NULL;
1499
1500         list_for_each_entry(info, &domain->devices, link)
1501                 if (info->iommu == iommu && info->bus == bus &&
1502                     info->devfn == devfn) {
1503                         if (info->ats_supported && info->dev)
1504                                 return info;
1505                         break;
1506                 }
1507
1508         return NULL;
1509 }
1510
1511 static void domain_update_iotlb(struct dmar_domain *domain)
1512 {
1513         struct device_domain_info *info;
1514         bool has_iotlb_device = false;
1515
1516         assert_spin_locked(&device_domain_lock);
1517
1518         list_for_each_entry(info, &domain->devices, link) {
1519                 struct pci_dev *pdev;
1520
1521                 if (!info->dev || !dev_is_pci(info->dev))
1522                         continue;
1523
1524                 pdev = to_pci_dev(info->dev);
1525                 if (pdev->ats_enabled) {
1526                         has_iotlb_device = true;
1527                         break;
1528                 }
1529         }
1530
1531         domain->has_iotlb_device = has_iotlb_device;
1532 }
1533
1534 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1535 {
1536         struct pci_dev *pdev;
1537
1538         assert_spin_locked(&device_domain_lock);
1539
1540         if (!info || !dev_is_pci(info->dev))
1541                 return;
1542
1543         pdev = to_pci_dev(info->dev);
1544         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1545          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1546          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1547          * reserved, which should be set to 0.
1548          */
1549         if (!ecap_dit(info->iommu->ecap))
1550                 info->pfsid = 0;
1551         else {
1552                 struct pci_dev *pf_pdev;
1553
1554                 /* pdev itself is returned if the device is not a VF */
1555                 pf_pdev = pci_physfn(pdev);
1556                 info->pfsid = pci_dev_id(pf_pdev);
1557         }
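             /*
              * For illustration (hypothetical example): for a VF at 0000:03:10.2
              * whose PF is 0000:03:00.0, pci_dev_id() on the PF yields
              * (0x03 << 8) | 0x00 = 0x0300, so the PFSID identifies the PF.
              */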
1558
1559 #ifdef CONFIG_INTEL_IOMMU_SVM
1560         /* The PCIe spec, in its wisdom, declares that the behaviour of
1561            the device is undefined if you enable PASID support after ATS
1562            support. So always enable PASID support on devices which have
1563            it, even if we can't yet know whether we're ever going to
1564            use it. */
1565         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1566                 info->pasid_enabled = 1;
1567
1568         if (info->pri_supported &&
1569             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1570             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1571                 info->pri_enabled = 1;
1572 #endif
1573         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1574             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1575                 info->ats_enabled = 1;
1576                 domain_update_iotlb(info->domain);
1577                 info->ats_qdep = pci_ats_queue_depth(pdev);
1578         }
1579 }
1580
1581 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1582 {
1583         struct pci_dev *pdev;
1584
1585         assert_spin_locked(&device_domain_lock);
1586
1587         if (!dev_is_pci(info->dev))
1588                 return;
1589
1590         pdev = to_pci_dev(info->dev);
1591
1592         if (info->ats_enabled) {
1593                 pci_disable_ats(pdev);
1594                 info->ats_enabled = 0;
1595                 domain_update_iotlb(info->domain);
1596         }
1597 #ifdef CONFIG_INTEL_IOMMU_SVM
1598         if (info->pri_enabled) {
1599                 pci_disable_pri(pdev);
1600                 info->pri_enabled = 0;
1601         }
1602         if (info->pasid_enabled) {
1603                 pci_disable_pasid(pdev);
1604                 info->pasid_enabled = 0;
1605         }
1606 #endif
1607 }
1608
1609 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1610                                   u64 addr, unsigned mask)
1611 {
1612         u16 sid, qdep;
1613         unsigned long flags;
1614         struct device_domain_info *info;
1615
1616         if (!domain->has_iotlb_device)
1617                 return;
1618
1619         spin_lock_irqsave(&device_domain_lock, flags);
1620         list_for_each_entry(info, &domain->devices, link) {
1621                 if (!info->ats_enabled)
1622                         continue;
1623
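                     /* Source ID of the request: bus number in bits 15:8, devfn in bits 7:0. */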
1624                 sid = info->bus << 8 | info->devfn;
1625                 qdep = info->ats_qdep;
1626                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1627                                 qdep, addr, mask);
1628         }
1629         spin_unlock_irqrestore(&device_domain_lock, flags);
1630 }
1631
1632 static void domain_flush_piotlb(struct intel_iommu *iommu,
1633                                 struct dmar_domain *domain,
1634                                 u64 addr, unsigned long npages, bool ih)
1635 {
1636         u16 did = domain->iommu_did[iommu->seq_id];
1637
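             /*
              * Flush the domain's default PASID (if one is set) as well as
              * PASID_RID2PASID, which is used for DMA requests without a PASID.
              */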
1638         if (domain->default_pasid)
1639                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1640                                 addr, npages, ih);
1641
1642         if (!list_empty(&domain->devices))
1643                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1644 }
1645
1646 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1647                                   struct dmar_domain *domain,
1648                                   unsigned long pfn, unsigned int pages,
1649                                   int ih, int map)
1650 {
1651         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1652         unsigned int mask = ilog2(aligned_pages);
1653         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1654         u16 did = domain->iommu_did[iommu->seq_id];
1655
1656         BUG_ON(pages == 0);
1657
1658         if (ih)
1659                 ih = 1 << 6;
1660
1661         if (domain_use_first_level(domain)) {
1662                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1663         } else {
1664                 unsigned long bitmask = aligned_pages - 1;
1665
1666                 /*
1667                  * PSI masks the low order bits of the base address. If the
1668                  * address isn't aligned to the mask, then compute a mask value
1669                  * needed to ensure the target range is flushed.
1670                  */
1671                 if (unlikely(bitmask & pfn)) {
1672                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1673
1674                         /*
1675                          * Since end_pfn <= pfn + bitmask, the only way bits
1676                          * higher than bitmask can differ in pfn and end_pfn is
1677                          * by carrying. This means after masking out bitmask,
1678                          * high bits starting with the first set bit in
1679                          * shared_bits are all equal in both pfn and end_pfn.
1680                          */
1681                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1682                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1683                 }
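                     /*
                      * Worked example (hypothetical values): pfn = 0x1003 and pages = 2
                      * give aligned_pages = 2 and bitmask = 1, so pfn is misaligned.
                      * Then end_pfn = 0x1004, pfn ^ end_pfn = 0x7, shared_bits =
                      * ~0x7 & ~0x1 = ...11111000, and mask = __ffs(shared_bits) = 3,
                      * which flushes the aligned range 0x1000-0x1007 and thus covers
                      * 0x1003-0x1004.
                      */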
1684
1685                 /*
1686                  * Fallback to domain selective flush if no PSI support or
1687                  * the size is too big.
1688                  */
1689                 if (!cap_pgsel_inv(iommu->cap) ||
1690                     mask > cap_max_amask_val(iommu->cap))
1691                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1692                                                         DMA_TLB_DSI_FLUSH);
1693                 else
1694                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1695                                                         DMA_TLB_PSI_FLUSH);
1696         }
1697
1698         /*
1699          * In caching mode, changes of pages from non-present to present require
1700          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1701          */
1702         if (!cap_caching_mode(iommu->cap) || !map)
1703                 iommu_flush_dev_iotlb(domain, addr, mask);
1704 }
1705
1706 /* Notification for newly created mappings */
1707 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1708                                         struct dmar_domain *domain,
1709                                         unsigned long pfn, unsigned int pages)
1710 {
1711         /*
1712          * It's a non-present to present mapping. Only flush if caching mode
1713          * and second level.
1714          */
1715         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1716                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1717         else
1718                 iommu_flush_write_buffer(iommu);
1719 }
1720
1721 static void iommu_flush_iova(struct iova_domain *iovad)
1722 {
1723         struct dmar_domain *domain;
1724         int idx;
1725
1726         domain = container_of(iovad, struct dmar_domain, iovad);
1727
1728         for_each_domain_iommu(idx, domain) {
1729                 struct intel_iommu *iommu = g_iommus[idx];
1730                 u16 did = domain->iommu_did[iommu->seq_id];
1731
1732                 if (domain_use_first_level(domain))
1733                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1734                 else
1735                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1736                                                  DMA_TLB_DSI_FLUSH);
1737
1738                 if (!cap_caching_mode(iommu->cap))
1739                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1740                                               0, MAX_AGAW_PFN_WIDTH);
1741         }
1742 }
1743
1744 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1745 {
1746         u32 pmen;
1747         unsigned long flags;
1748
1749         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1750                 return;
1751
1752         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1753         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1754         pmen &= ~DMA_PMEN_EPM;
1755         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1756
1757         /* wait for the protected region status bit to clear */
1758         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1759                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1760
1761         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1762 }
1763
1764 static void iommu_enable_translation(struct intel_iommu *iommu)
1765 {
1766         u32 sts;
1767         unsigned long flags;
1768
1769         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1770         iommu->gcmd |= DMA_GCMD_TE;
1771         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1772
1773         /* Make sure hardware completes it */
1774         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1775                       readl, (sts & DMA_GSTS_TES), sts);
1776
1777         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1778 }
1779
1780 static void iommu_disable_translation(struct intel_iommu *iommu)
1781 {
1782         u32 sts;
1783         unsigned long flag;
1784
1785         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1786             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1787                 return;
1788
1789         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1790         iommu->gcmd &= ~DMA_GCMD_TE;
1791         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1792
1793         /* Make sure hardware completes it */
1794         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1795                       readl, (!(sts & DMA_GSTS_TES)), sts);
1796
1797         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1798 }
1799
1800 static int iommu_init_domains(struct intel_iommu *iommu)
1801 {
1802         u32 ndomains, nlongs;
1803         size_t size;
1804
1805         ndomains = cap_ndoms(iommu->cap);
1806         pr_debug("%s: Number of Domains supported <%d>\n",
1807                  iommu->name, ndomains);
1808         nlongs = BITS_TO_LONGS(ndomains);
1809
1810         spin_lock_init(&iommu->lock);
1811
1812         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1813         if (!iommu->domain_ids) {
1814                 pr_err("%s: Allocating domain id array failed\n",
1815                        iommu->name);
1816                 return -ENOMEM;
1817         }
1818
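             /*
              * iommu->domains is a two-level table: an array of chunk pointers,
              * each chunk holding 256 domain pointers. Only the first chunk is
              * allocated here; later chunks are expected to be allocated on
              * demand.
              */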
1819         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1820         iommu->domains = kzalloc(size, GFP_KERNEL);
1821
1822         if (iommu->domains) {
1823                 size = 256 * sizeof(struct dmar_domain *);
1824                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1825         }
1826
1827         if (!iommu->domains || !iommu->domains[0]) {
1828                 pr_err("%s: Allocating domain array failed\n",
1829                        iommu->name);
1830                 kfree(iommu->domain_ids);
1831                 kfree(iommu->domains);
1832                 iommu->domain_ids = NULL;
1833                 iommu->domains    = NULL;
1834                 return -ENOMEM;
1835         }
1836
1837         /*
1838          * If Caching mode is set, then invalid translations are tagged
1839          * with domain-id 0, hence we need to pre-allocate it. We also
1840          * use domain-id 0 as a marker for non-allocated domain-id, so
1841          * make sure it is not used for a real domain.
1842          */
1843         set_bit(0, iommu->domain_ids);
1844
1845         /*
1846          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1847          * entry for first-level or pass-through translation modes should
1848          * be programmed with a domain id different from those used for
1849          * second-level or nested translation. We reserve a domain id for
1850          * this purpose.
1851          */
1852         if (sm_supported(iommu))
1853                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1854
1855         return 0;
1856 }
1857
1858 static void disable_dmar_iommu(struct intel_iommu *iommu)
1859 {
1860         struct device_domain_info *info, *tmp;
1861         unsigned long flags;
1862
1863         if (!iommu->domains || !iommu->domain_ids)
1864                 return;
1865
1866         spin_lock_irqsave(&device_domain_lock, flags);
1867         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1868                 if (info->iommu != iommu)
1869                         continue;
1870
1871                 if (!info->dev || !info->domain)
1872                         continue;
1873
1874                 __dmar_remove_one_dev_info(info);
1875         }
1876         spin_unlock_irqrestore(&device_domain_lock, flags);
1877
1878         if (iommu->gcmd & DMA_GCMD_TE)
1879                 iommu_disable_translation(iommu);
1880 }
1881
1882 static void free_dmar_iommu(struct intel_iommu *iommu)
1883 {
1884         if ((iommu->domains) && (iommu->domain_ids)) {
1885                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1886                 int i;
1887
1888                 for (i = 0; i < elems; i++)
1889                         kfree(iommu->domains[i]);
1890                 kfree(iommu->domains);
1891                 kfree(iommu->domain_ids);
1892                 iommu->domains = NULL;
1893                 iommu->domain_ids = NULL;
1894         }
1895
1896         g_iommus[iommu->seq_id] = NULL;
1897
1898         /* free context mapping */
1899         free_context_table(iommu);
1900
1901 #ifdef CONFIG_INTEL_IOMMU_SVM
1902         if (pasid_supported(iommu)) {
1903                 if (ecap_prs(iommu->ecap))
1904                         intel_svm_finish_prq(iommu);
1905         }
1906         if (vccap_pasid(iommu->vccap))
1907                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1908
1909 #endif
1910 }
1911
1912 /*
1913  * Check and return whether first-level translation is used by default
1914  * for DMA.
1915  */
1916 static bool first_level_by_default(void)
1917 {
1918         struct dmar_drhd_unit *drhd;
1919         struct intel_iommu *iommu;
1920         static int first_level_support = -1;
1921
1922         if (likely(first_level_support != -1))
1923                 return first_level_support;
1924
1925         first_level_support = 1;
1926
1927         rcu_read_lock();
1928         for_each_active_iommu(iommu, drhd) {
1929                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1930                         first_level_support = 0;
1931                         break;
1932                 }
1933         }
1934         rcu_read_unlock();
1935
1936         return first_level_support;
1937 }
1938
1939 static struct dmar_domain *alloc_domain(int flags)
1940 {
1941         struct dmar_domain *domain;
1942
1943         domain = alloc_domain_mem();
1944         if (!domain)
1945                 return NULL;
1946
1947         memset(domain, 0, sizeof(*domain));
1948         domain->nid = NUMA_NO_NODE;
1949         domain->flags = flags;
1950         if (first_level_by_default())
1951                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1952         domain->has_iotlb_device = false;
1953         INIT_LIST_HEAD(&domain->devices);
1954
1955         return domain;
1956 }
1957
1958 /* Must be called with iommu->lock */
1959 static int domain_attach_iommu(struct dmar_domain *domain,
1960                                struct intel_iommu *iommu)
1961 {
1962         unsigned long ndomains;
1963         int num;
1964
1965         assert_spin_locked(&device_domain_lock);
1966         assert_spin_locked(&iommu->lock);
1967
1968         domain->iommu_refcnt[iommu->seq_id] += 1;
1969         domain->iommu_count += 1;
1970         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1971                 ndomains = cap_ndoms(iommu->cap);
1972                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1973
1974                 if (num >= ndomains) {
1975                         pr_err("%s: No free domain ids\n", iommu->name);
1976                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1977                         domain->iommu_count -= 1;
1978                         return -ENOSPC;
1979                 }
1980
1981                 set_bit(num, iommu->domain_ids);
1982                 set_iommu_domain(iommu, num, domain);
1983
1984                 domain->iommu_did[iommu->seq_id] = num;
1985                 domain->nid                      = iommu->node;
1986
1987                 domain_update_iommu_cap(domain);
1988         }
1989
1990         return 0;
1991 }
1992
1993 static int domain_detach_iommu(struct dmar_domain *domain,
1994                                struct intel_iommu *iommu)
1995 {
1996         int num, count;
1997
1998         assert_spin_locked(&device_domain_lock);
1999         assert_spin_locked(&iommu->lock);
2000
2001         domain->iommu_refcnt[iommu->seq_id] -= 1;
2002         count = --domain->iommu_count;
2003         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
2004                 num = domain->iommu_did[iommu->seq_id];
2005                 clear_bit(num, iommu->domain_ids);
2006                 set_iommu_domain(iommu, num, NULL);
2007
2008                 domain_update_iommu_cap(domain);
2009                 domain->iommu_did[iommu->seq_id] = 0;
2010         }
2011
2012         return count;
2013 }
2014
2015 static struct iova_domain reserved_iova_list;
2016 static struct lock_class_key reserved_rbtree_key;
2017
2018 static int dmar_init_reserved_ranges(void)
2019 {
2020         struct pci_dev *pdev = NULL;
2021         struct iova *iova;
2022         int i;
2023
2024         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
2025
2026         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
2027                 &reserved_rbtree_key);
2028
2029         /* IOAPIC ranges shouldn't be accessed by DMA */
2030         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
2031                 IOVA_PFN(IOAPIC_RANGE_END));
2032         if (!iova) {
2033                 pr_err("Reserve IOAPIC range failed\n");
2034                 return -ENODEV;
2035         }
2036
2037         /* Reserve all PCI MMIO to avoid peer-to-peer access */
2038         for_each_pci_dev(pdev) {
2039                 struct resource *r;
2040
2041                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
2042                         r = &pdev->resource[i];
2043                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
2044                                 continue;
2045                         iova = reserve_iova(&reserved_iova_list,
2046                                             IOVA_PFN(r->start),
2047                                             IOVA_PFN(r->end));
2048                         if (!iova) {
2049                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
2050                                 return -ENODEV;
2051                         }
2052                 }
2053         }
2054         return 0;
2055 }
2056
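     /*
      * Round the guest address width up to an "adjusted" width that whole
      * page-table levels can express: (agaw - 12) is forced up to a multiple
      * of the 9-bit stride, capped at 64. For example, gaw = 40 gives
      * r = (40 - 12) % 9 = 1 and agaw = 40 + 9 - 1 = 48.
      */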
2057 static inline int guestwidth_to_adjustwidth(int gaw)
2058 {
2059         int agaw;
2060         int r = (gaw - 12) % 9;
2061
2062         if (r == 0)
2063                 agaw = gaw;
2064         else
2065                 agaw = gaw + 9 - r;
2066         if (agaw > 64)
2067                 agaw = 64;
2068         return agaw;
2069 }
2070
2071 static void domain_exit(struct dmar_domain *domain)
2072 {
2073
2074         /* Remove associated devices and clear attached or cached domains */
2075         domain_remove_dev_info(domain);
2076
2077         /* destroy iovas */
2078         if (domain->domain.type == IOMMU_DOMAIN_DMA)
2079                 put_iova_domain(&domain->iovad);
2080
2081         if (domain->pgd) {
2082                 struct page *freelist;
2083
2084                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2085                 dma_free_pagelist(freelist);
2086         }
2087
2088         free_domain_mem(domain);
2089 }
2090
2091 /*
2092  * Get the PASID directory size for scalable mode context entry.
2093  * Value of X in the PDTS field of a scalable mode context entry
2094  * indicates PASID directory with 2^(X + 7) entries.
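      * For example, a returned pds of 2 encodes a PASID directory with
      * 2^(2 + 7) = 512 entries.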
2095  */
2096 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2097 {
2098         int pds, max_pde;
2099
2100         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2101         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2102         if (pds < 7)
2103                 return 0;
2104
2105         return pds - 7;
2106 }
2107
2108 /*
2109  * Set the RID_PASID field of a scalable mode context entry. The
2110  * IOMMU hardware will use the PASID value set in this field for
2111  * DMA translations of DMA requests without PASID.
2112  */
2113 static inline void
2114 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2115 {
2116         context->hi |= pasid & ((1 << 20) - 1);
2117 }
2118
2119 /*
2120  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2121  * entry.
2122  */
2123 static inline void context_set_sm_dte(struct context_entry *context)
2124 {
2125         context->lo |= (1 << 2);
2126 }
2127
2128 /*
2129  * Set the PRE(Page Request Enable) field of a scalable mode context
2130  * entry.
2131  */
2132 static inline void context_set_sm_pre(struct context_entry *context)
2133 {
2134         context->lo |= (1 << 4);
2135 }
2136
2137 /* Convert value to context PASID directory size field coding. */
2138 #define context_pdts(pds)       (((pds) & 0x7) << 9)
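     /* For example, context_pdts(2) = 2 << 9 = 0x400, which lands in bits 11:9 of context->lo. */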
2139
2140 static int domain_context_mapping_one(struct dmar_domain *domain,
2141                                       struct intel_iommu *iommu,
2142                                       struct pasid_table *table,
2143                                       u8 bus, u8 devfn)
2144 {
2145         u16 did = domain->iommu_did[iommu->seq_id];
2146         int translation = CONTEXT_TT_MULTI_LEVEL;
2147         struct device_domain_info *info = NULL;
2148         struct context_entry *context;
2149         unsigned long flags;
2150         int ret;
2151
2152         WARN_ON(did == 0);
2153
2154         if (hw_pass_through && domain_type_is_si(domain))
2155                 translation = CONTEXT_TT_PASS_THROUGH;
2156
2157         pr_debug("Set context mapping for %02x:%02x.%d\n",
2158                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2159
2160         BUG_ON(!domain->pgd);
2161
2162         spin_lock_irqsave(&device_domain_lock, flags);
2163         spin_lock(&iommu->lock);
2164
2165         ret = -ENOMEM;
2166         context = iommu_context_addr(iommu, bus, devfn, 1);
2167         if (!context)
2168                 goto out_unlock;
2169
2170         ret = 0;
2171         if (context_present(context))
2172                 goto out_unlock;
2173
2174         /*
2175          * For kdump cases, old valid entries may be cached due to
2176          * in-flight DMA and the copied pgtable, but there is no unmapping
2177          * behaviour for them, thus we need an explicit cache flush for
2178          * the newly-mapped device. For kdump, at this point, the device
2179          * is supposed to have finished reset at its driver probe stage,
2180          * so no in-flight DMA will exist, and we don't need to worry
2181          * about it hereafter.
2182          */
2183         if (context_copied(context)) {
2184                 u16 did_old = context_domain_id(context);
2185
2186                 if (did_old < cap_ndoms(iommu->cap)) {
2187                         iommu->flush.flush_context(iommu, did_old,
2188                                                    (((u16)bus) << 8) | devfn,
2189                                                    DMA_CCMD_MASK_NOBIT,
2190                                                    DMA_CCMD_DEVICE_INVL);
2191                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2192                                                  DMA_TLB_DSI_FLUSH);
2193                 }
2194         }
2195
2196         context_clear_entry(context);
2197
2198         if (sm_supported(iommu)) {
2199                 unsigned long pds;
2200
2201                 WARN_ON(!table);
2202
2203                 /* Setup the PASID DIR pointer: */
2204                 pds = context_get_sm_pds(table);
2205                 context->lo = (u64)virt_to_phys(table->table) |
2206                                 context_pdts(pds);
2207
2208                 /* Setup the RID_PASID field: */
2209                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2210
2211                 /*
2212                  * Setup the Device-TLB enable bit and Page request
2213                  * Enable bit:
2214                  */
2215                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2216                 if (info && info->ats_supported)
2217                         context_set_sm_dte(context);
2218                 if (info && info->pri_supported)
2219                         context_set_sm_pre(context);
2220         } else {
2221                 struct dma_pte *pgd = domain->pgd;
2222                 int agaw;
2223
2224                 context_set_domain_id(context, did);
2225
2226                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2227                         /*
2228                          * Skip top levels of page tables for an IOMMU which has
2229                          * a smaller agaw than the default. Unnecessary for PT mode.
2230                          */
2231                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2232                                 ret = -ENOMEM;
2233                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2234                                 if (!dma_pte_present(pgd))
2235                                         goto out_unlock;
2236                         }
2237
2238                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2239                         if (info && info->ats_supported)
2240                                 translation = CONTEXT_TT_DEV_IOTLB;
2241                         else
2242                                 translation = CONTEXT_TT_MULTI_LEVEL;
2243
2244                         context_set_address_root(context, virt_to_phys(pgd));
2245                         context_set_address_width(context, agaw);
2246                 } else {
2247                         /*
2248                          * In pass-through mode, AW must be programmed to
2249                          * indicate the largest AGAW value supported by
2250                          * hardware, and ASR is ignored by hardware.
2251                          */
2252                         context_set_address_width(context, iommu->msagaw);
2253                 }
2254
2255                 context_set_translation_type(context, translation);
2256         }
2257
2258         context_set_fault_enable(context);
2259         context_set_present(context);
2260         if (!ecap_coherent(iommu->ecap))
2261                 clflush_cache_range(context, sizeof(*context));
2262
2263         /*
2264          * It's a non-present to present mapping. If hardware doesn't cache
2265          * non-present entries we only need to flush the write-buffer. If it
2266          * _does_ cache non-present entries, then it does so in the special
2267          * domain #0, which we have to flush:
2268          */
2269         if (cap_caching_mode(iommu->cap)) {
2270                 iommu->flush.flush_context(iommu, 0,
2271                                            (((u16)bus) << 8) | devfn,
2272                                            DMA_CCMD_MASK_NOBIT,
2273                                            DMA_CCMD_DEVICE_INVL);
2274                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2275         } else {
2276                 iommu_flush_write_buffer(iommu);
2277         }
2278         iommu_enable_dev_iotlb(info);
2279
2280         ret = 0;
2281
2282 out_unlock:
2283         spin_unlock(&iommu->lock);
2284         spin_unlock_irqrestore(&device_domain_lock, flags);
2285
2286         return ret;
2287 }
2288
2289 struct domain_context_mapping_data {
2290         struct dmar_domain *domain;
2291         struct intel_iommu *iommu;
2292         struct pasid_table *table;
2293 };
2294
2295 static int domain_context_mapping_cb(struct pci_dev *pdev,
2296                                      u16 alias, void *opaque)
2297 {
2298         struct domain_context_mapping_data *data = opaque;
2299
2300         return domain_context_mapping_one(data->domain, data->iommu,
2301                                           data->table, PCI_BUS_NUM(alias),
2302                                           alias & 0xff);
2303 }
2304
2305 static int
2306 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2307 {
2308         struct domain_context_mapping_data data;
2309         struct pasid_table *table;
2310         struct intel_iommu *iommu;
2311         u8 bus, devfn;
2312
2313         iommu = device_to_iommu(dev, &bus, &devfn);
2314         if (!iommu)
2315                 return -ENODEV;
2316
2317         table = intel_pasid_get_table(dev);
2318
2319         if (!dev_is_pci(dev))
2320                 return domain_context_mapping_one(domain, iommu, table,
2321                                                   bus, devfn);
2322
2323         data.domain = domain;
2324         data.iommu = iommu;
2325         data.table = table;
2326
2327         return pci_for_each_dma_alias(to_pci_dev(dev),
2328                                       &domain_context_mapping_cb, &data);
2329 }
2330
2331 static int domain_context_mapped_cb(struct pci_dev *pdev,
2332                                     u16 alias, void *opaque)
2333 {
2334         struct intel_iommu *iommu = opaque;
2335
2336         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2337 }
2338
2339 static int domain_context_mapped(struct device *dev)
2340 {
2341         struct intel_iommu *iommu;
2342         u8 bus, devfn;
2343
2344         iommu = device_to_iommu(dev, &bus, &devfn);
2345         if (!iommu)
2346                 return -ENODEV;
2347
2348         if (!dev_is_pci(dev))
2349                 return device_context_mapped(iommu, bus, devfn);
2350
2351         return !pci_for_each_dma_alias(to_pci_dev(dev),
2352                                        domain_context_mapped_cb, iommu);
2353 }
2354
2355 /* Returns the number of VT-d pages, rounded up to cover whole MM pages */
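     /*
      * For example (hypothetical values, 4KiB pages): host_addr = 0x1234 and
      * size = 0x2000 give PAGE_ALIGN(0x234 + 0x2000) = 0x3000, i.e. three
      * VT-d pages.
      */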
2356 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2357                                             size_t size)
2358 {
2359         host_addr &= ~PAGE_MASK;
2360         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2361 }
2362
2363 /* Return largest possible superpage level for a given mapping */
2364 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2365                                           unsigned long iov_pfn,
2366                                           unsigned long phy_pfn,
2367                                           unsigned long pages)
2368 {
2369         int support, level = 1;
2370         unsigned long pfnmerge;
2371
2372         support = domain->iommu_superpage;
2373
2374         /* To use a large page, the virtual *and* physical addresses
2375            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2376            of them will mean we have to use smaller pages. So just
2377            merge them and check both at once. */
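             /*
              * For illustration (hypothetical values): iov_pfn = 0x200, phy_pfn =
              * 0x80200, pages = 0x400 and iommu_superpage support = 2. pfnmerge =
              * 0x80200 has its low 9 bits clear, so one loop pass bumps level to 2
              * (2MiB); the shifted pfnmerge (0x401) is then misaligned, so level 2
              * is returned.
              */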
2378         pfnmerge = iov_pfn | phy_pfn;
2379
2380         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2381                 pages >>= VTD_STRIDE_SHIFT;
2382                 if (!pages)
2383                         break;
2384                 pfnmerge >>= VTD_STRIDE_SHIFT;
2385                 level++;
2386                 support--;
2387         }
2388         return level;
2389 }
2390
2391 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2392                             struct scatterlist *sg, unsigned long phys_pfn,
2393                             unsigned long nr_pages, int prot)
2394 {
2395         struct dma_pte *first_pte = NULL, *pte = NULL;
2396         phys_addr_t pteval;
2397         unsigned long sg_res = 0;
2398         unsigned int largepage_lvl = 0;
2399         unsigned long lvl_pages = 0;
2400         u64 attr;
2401
2402         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2403
2404         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2405                 return -EINVAL;
2406
2407         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2408         attr |= DMA_FL_PTE_PRESENT;
2409         if (domain_use_first_level(domain)) {
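                     /*
                      * First-level page tables use the CPU paging format; XD
                      * (execute-disable) and US (user) are set on DMA mappings.
                      */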
2410                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2411
2412                 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2413                         attr |= DMA_FL_PTE_ACCESS;
2414                         if (prot & DMA_PTE_WRITE)
2415                                 attr |= DMA_FL_PTE_DIRTY;
2416                 }
2417         }
2418
2419         if (!sg) {
2420                 sg_res = nr_pages;
2421                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2422         }
2423
2424         while (nr_pages > 0) {
2425                 uint64_t tmp;
2426
2427                 if (!sg_res) {
2428                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2429
2430                         sg_res = aligned_nrpages(sg->offset, sg->length);
2431                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2432                         sg->dma_length = sg->length;
2433                         pteval = (sg_phys(sg) - pgoff) | attr;
2434                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2435                 }
2436
2437                 if (!pte) {
2438                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2439
2440                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2441                         if (!pte)
2442                                 return -ENOMEM;
2443                         /* It is a large page */
2444                         if (largepage_lvl > 1) {
2445                                 unsigned long nr_superpages, end_pfn;
2446
2447                                 pteval |= DMA_PTE_LARGE_PAGE;
2448                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2449
2450                                 nr_superpages = sg_res / lvl_pages;
2451                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2452
2453                                 /*
2454                                  * Ensure that old small page tables are
2455                                  * removed to make room for superpage(s).
2456                                  * We're adding new large pages, so make sure
2457                                  * we don't remove their parent tables.
2458                                  */
2459                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2460                                                        largepage_lvl + 1);
2461                         } else {
2462                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2463                         }
2464
2465                 }
2466                 /* We don't need a lock here; nobody else
2467                  * touches this IOVA range.
2468                  */
2469                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2470                 if (tmp) {
2471                         static int dumps = 5;
2472                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2473                                 iov_pfn, tmp, (unsigned long long)pteval);
2474                         if (dumps) {
2475                                 dumps--;
2476                                 debug_dma_dump_mappings(NULL);
2477                         }
2478                         WARN_ON(1);
2479                 }
2480
2481                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2482
2483                 BUG_ON(nr_pages < lvl_pages);
2484                 BUG_ON(sg_res < lvl_pages);
2485
2486                 nr_pages -= lvl_pages;
2487                 iov_pfn += lvl_pages;
2488                 phys_pfn += lvl_pages;
2489                 pteval += lvl_pages * VTD_PAGE_SIZE;
2490                 sg_res -= lvl_pages;
2491
2492                 /* If the next PTE would be the first in a new page, then we
2493                    need to flush the cache on the entries we've just written.
2494                    And then we'll need to recalculate 'pte', so clear it and
2495                    let it get set again in the if (!pte) block above.
2496
2497                    If we're done (!nr_pages) we need to flush the cache too.
2498
2499                    Also if we've been setting superpages, we may need to
2500                    recalculate 'pte' and switch back to smaller pages for the
2501                    end of the mapping, if the trailing size is not enough to
2502                    use another superpage (i.e. sg_res < lvl_pages). */
2503                 pte++;
2504                 if (!nr_pages || first_pte_in_page(pte) ||
2505                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2506                         domain_flush_cache(domain, first_pte,
2507                                            (void *)pte - (void *)first_pte);
2508                         pte = NULL;
2509                 }
2510
2511                 if (!sg_res && nr_pages)
2512                         sg = sg_next(sg);
2513         }
2514         return 0;
2515 }
2516
2517 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2518                           struct scatterlist *sg, unsigned long phys_pfn,
2519                           unsigned long nr_pages, int prot)
2520 {
2521         int iommu_id, ret;
2522         struct intel_iommu *iommu;
2523
2524         /* Do the real mapping first */
2525         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2526         if (ret)
2527                 return ret;
2528
2529         for_each_domain_iommu(iommu_id, domain) {
2530                 iommu = g_iommus[iommu_id];
2531                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2532         }
2533
2534         return 0;
2535 }
2536
2537 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2538                                     struct scatterlist *sg, unsigned long nr_pages,
2539                                     int prot)
2540 {
2541         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2542 }
2543
2544 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2545                                      unsigned long phys_pfn, unsigned long nr_pages,
2546                                      int prot)
2547 {
2548         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2549 }
2550
2551 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2552 {
2553         unsigned long flags;
2554         struct context_entry *context;
2555         u16 did_old;
2556
2557         if (!iommu)
2558                 return;
2559
2560         spin_lock_irqsave(&iommu->lock, flags);
2561         context = iommu_context_addr(iommu, bus, devfn, 0);
2562         if (!context) {
2563                 spin_unlock_irqrestore(&iommu->lock, flags);
2564                 return;
2565         }
2566         did_old = context_domain_id(context);
2567         context_clear_entry(context);
2568         __iommu_flush_cache(iommu, context, sizeof(*context));
2569         spin_unlock_irqrestore(&iommu->lock, flags);
2570         iommu->flush.flush_context(iommu,
2571                                    did_old,
2572                                    (((u16)bus) << 8) | devfn,
2573                                    DMA_CCMD_MASK_NOBIT,
2574                                    DMA_CCMD_DEVICE_INVL);
2575
2576         if (sm_supported(iommu))
2577                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2578
2579         iommu->flush.flush_iotlb(iommu,
2580                                  did_old,
2581                                  0,
2582                                  0,
2583                                  DMA_TLB_DSI_FLUSH);
2584 }
2585
2586 static inline void unlink_domain_info(struct device_domain_info *info)
2587 {
2588         assert_spin_locked(&device_domain_lock);
2589         list_del(&info->link);
2590         list_del(&info->global);
2591         if (info->dev)
2592                 dev_iommu_priv_set(info->dev, NULL);
2593 }
2594
2595 static void domain_remove_dev_info(struct dmar_domain *domain)
2596 {
2597         struct device_domain_info *info, *tmp;
2598         unsigned long flags;
2599
2600         spin_lock_irqsave(&device_domain_lock, flags);
2601         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2602                 __dmar_remove_one_dev_info(info);
2603         spin_unlock_irqrestore(&device_domain_lock, flags);
2604 }
2605
2606 struct dmar_domain *find_domain(struct device *dev)
2607 {
2608         struct device_domain_info *info;
2609
2610         if (unlikely(!dev || !dev->iommu))
2611                 return NULL;
2612
2613         if (unlikely(attach_deferred(dev)))
2614                 return NULL;
2615
2616         /* No lock here; we assume no domain exits in the normal case */
2617         info = get_domain_info(dev);
2618         if (likely(info))
2619                 return info->domain;
2620
2621         return NULL;
2622 }
2623
2624 static void do_deferred_attach(struct device *dev)
2625 {
2626         struct iommu_domain *domain;
2627
2628         dev_iommu_priv_set(dev, NULL);
2629         domain = iommu_get_domain_for_dev(dev);
2630         if (domain)
2631                 intel_iommu_attach_device(domain, dev);
2632 }
2633
2634 static inline struct device_domain_info *
2635 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2636 {
2637         struct device_domain_info *info;
2638
2639         list_for_each_entry(info, &device_domain_list, global)
2640                 if (info->segment == segment && info->bus == bus &&
2641                     info->devfn == devfn)
2642                         return info;
2643
2644         return NULL;
2645 }
2646
2647 static int domain_setup_first_level(struct intel_iommu *iommu,
2648                                     struct dmar_domain *domain,
2649                                     struct device *dev,
2650                                     u32 pasid)
2651 {
2652         struct dma_pte *pgd = domain->pgd;
2653         int agaw, level;
2654         int flags = 0;
2655
2656         /*
2657          * Skip top levels of page tables for an IOMMU which has
2658          * a smaller agaw than the default. Unnecessary for PT mode.
2659          */
2660         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2661                 pgd = phys_to_virt(dma_pte_addr(pgd));
2662                 if (!dma_pte_present(pgd))
2663                         return -ENOMEM;
2664         }
2665
2666         level = agaw_to_level(agaw);
2667         if (level != 4 && level != 5)
2668                 return -EINVAL;
2669
2670         if (pasid != PASID_RID2PASID)
2671                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2672         if (level == 5)
2673                 flags |= PASID_FLAG_FL5LP;
2674
2675         if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2676                 flags |= PASID_FLAG_PAGE_SNOOP;
2677
2678         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2679                                              domain->iommu_did[iommu->seq_id],
2680                                              flags);
2681 }
2682
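     /* True if DMA for @dev is actually issued by a different ("real") PCI device. */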
2683 static bool dev_is_real_dma_subdevice(struct device *dev)
2684 {
2685         return dev && dev_is_pci(dev) &&
2686                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2687 }
2688
2689 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2690                                                     int bus, int devfn,
2691                                                     struct device *dev,
2692                                                     struct dmar_domain *domain)
2693 {
2694         struct dmar_domain *found = NULL;
2695         struct device_domain_info *info;
2696         unsigned long flags;
2697         int ret;
2698
2699         info = alloc_devinfo_mem();
2700         if (!info)
2701                 return NULL;
2702
2703         if (!dev_is_real_dma_subdevice(dev)) {
2704                 info->bus = bus;
2705                 info->devfn = devfn;
2706                 info->segment = iommu->segment;
2707         } else {
2708                 struct pci_dev *pdev = to_pci_dev(dev);
2709
2710                 info->bus = pdev->bus->number;
2711                 info->devfn = pdev->devfn;
2712                 info->segment = pci_domain_nr(pdev->bus);
2713         }
2714
2715         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2716         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2717         info->ats_qdep = 0;
2718         info->dev = dev;
2719         info->domain = domain;
2720         info->iommu = iommu;
2721         info->pasid_table = NULL;
2722         info->auxd_enabled = 0;
2723         INIT_LIST_HEAD(&info->auxiliary_domains);
2724
2725         if (dev && dev_is_pci(dev)) {
2726                 struct pci_dev *pdev = to_pci_dev(info->dev);
2727
2728                 if (ecap_dev_iotlb_support(iommu->ecap) &&
2729                     pci_ats_supported(pdev) &&
2730                     dmar_find_matched_atsr_unit(pdev))
2731                         info->ats_supported = 1;
2732
2733                 if (sm_supported(iommu)) {
2734                         if (pasid_supported(iommu)) {
2735                                 int features = pci_pasid_features(pdev);
2736                                 if (features >= 0)
2737                                         info->pasid_supported = features | 1;
2738                         }
2739
2740                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2741                             pci_pri_supported(pdev))
2742                                 info->pri_supported = 1;
2743                 }
2744         }
2745
2746         spin_lock_irqsave(&device_domain_lock, flags);
2747         if (dev)
2748                 found = find_domain(dev);
2749
2750         if (!found) {
2751                 struct device_domain_info *info2;
2752                 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2753                                                        info->devfn);
2754                 if (info2) {
2755                         found      = info2->domain;
2756                         info2->dev = dev;
2757                 }
2758         }
2759
2760         if (found) {
2761                 spin_unlock_irqrestore(&device_domain_lock, flags);
2762                 free_devinfo_mem(info);
2763                 /* Caller must free the original domain */
2764                 return found;
2765         }
2766
2767         spin_lock(&iommu->lock);
2768         ret = domain_attach_iommu(domain, iommu);
2769         spin_unlock(&iommu->lock);
2770
2771         if (ret) {
2772                 spin_unlock_irqrestore(&device_domain_lock, flags);
2773                 free_devinfo_mem(info);
2774                 return NULL;
2775         }
2776
2777         list_add(&info->link, &domain->devices);
2778         list_add(&info->global, &device_domain_list);
2779         if (dev)
2780                 dev_iommu_priv_set(dev, info);
2781         spin_unlock_irqrestore(&device_domain_lock, flags);
2782
2783         /* PASID table is mandatory for a PCI device in scalable mode. */
2784         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2785                 ret = intel_pasid_alloc_table(dev);
2786                 if (ret) {
2787                         dev_err(dev, "PASID table allocation failed\n");
2788                         dmar_remove_one_dev_info(dev);
2789                         return NULL;
2790                 }
2791
2792                 /* Setup the PASID entry for requests without PASID: */
2793                 spin_lock_irqsave(&iommu->lock, flags);
2794                 if (hw_pass_through && domain_type_is_si(domain))
2795                         ret = intel_pasid_setup_pass_through(iommu, domain,
2796                                         dev, PASID_RID2PASID);
2797                 else if (domain_use_first_level(domain))
2798                         ret = domain_setup_first_level(iommu, domain, dev,
2799                                         PASID_RID2PASID);
2800                 else
2801                         ret = intel_pasid_setup_second_level(iommu, domain,
2802                                         dev, PASID_RID2PASID);
2803                 spin_unlock_irqrestore(&iommu->lock, flags);
2804                 if (ret) {
2805                         dev_err(dev, "Setup RID2PASID failed\n");
2806                         dmar_remove_one_dev_info(dev);
2807                         return NULL;
2808                 }
2809         }
2810
2811         if (dev && domain_context_mapping(domain, dev)) {
2812                 dev_err(dev, "Domain context map failed\n");
2813                 dmar_remove_one_dev_info(dev);
2814                 return NULL;
2815         }
2816
2817         return domain;
2818 }
2819
2820 static int iommu_domain_identity_map(struct dmar_domain *domain,
2821                                      unsigned long first_vpfn,
2822                                      unsigned long last_vpfn)
2823 {
2824         /*
2825          * The RMRR range might overlap with the physical memory range,
2826          * so clear it first.
2827          */
2828         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2829
2830         return __domain_mapping(domain, first_vpfn, NULL,
2831                                 first_vpfn, last_vpfn - first_vpfn + 1,
2832                                 DMA_PTE_READ|DMA_PTE_WRITE);
2833 }
2834
2835 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2836
2837 static int __init si_domain_init(int hw)
2838 {
2839         struct dmar_rmrr_unit *rmrr;
2840         struct device *dev;
2841         int i, nid, ret;
2842
2843         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2844         if (!si_domain)
2845                 return -EFAULT;
2846
2847         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2848                 domain_exit(si_domain);
2849                 si_domain = NULL;
2850                 return -EFAULT;
2851         }
2852
2853         if (hw)
2854                 return 0;
2855
2856         for_each_online_node(nid) {
2857                 unsigned long start_pfn, end_pfn;
2858                 int i;
2859
2860                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2861                         ret = iommu_domain_identity_map(si_domain,
2862                                         mm_to_dma_pfn(start_pfn),
2863                                         mm_to_dma_pfn(end_pfn));
2864                         if (ret)
2865                                 return ret;
2866                 }
2867         }
2868
2869         /*
2870          * Identity map the RMRRs so that devices with RMRRs could also use
2871          * the si_domain.
2872          */
2873         for_each_rmrr_units(rmrr) {
2874                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2875                                           i, dev) {
2876                         unsigned long long start = rmrr->base_address;
2877                         unsigned long long end = rmrr->end_address;
2878
2879                         if (WARN_ON(end < start ||
2880                                     end >> agaw_to_width(si_domain->agaw)))
2881                                 continue;
2882
2883                         ret = iommu_domain_identity_map(si_domain,
2884                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2885                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2886                         if (ret)
2887                                 return ret;
2888                 }
2889         }
2890
2891         return 0;
2892 }
2893
2894 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2895 {
2896         struct dmar_domain *ndomain;
2897         struct intel_iommu *iommu;
2898         u8 bus, devfn;
2899
2900         iommu = device_to_iommu(dev, &bus, &devfn);
2901         if (!iommu)
2902                 return -ENODEV;
2903
2904         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2905         if (ndomain != domain)
2906                 return -EBUSY;
2907
2908         return 0;
2909 }
2910
2911 static bool device_has_rmrr(struct device *dev)
2912 {
2913         struct dmar_rmrr_unit *rmrr;
2914         struct device *tmp;
2915         int i;
2916
2917         rcu_read_lock();
2918         for_each_rmrr_units(rmrr) {
2919                 /*
2920                  * Return TRUE if this RMRR contains the device that
2921                  * is passed in.
2922                  */
2923                 for_each_active_dev_scope(rmrr->devices,
2924                                           rmrr->devices_cnt, i, tmp)
2925                         if (tmp == dev ||
2926                             is_downstream_to_pci_bridge(dev, tmp)) {
2927                                 rcu_read_unlock();
2928                                 return true;
2929                         }
2930         }
2931         rcu_read_unlock();
2932         return false;
2933 }
2934
2935 /**
2936  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2937  * is relaxable (i.e. is allowed to not be enforced under some conditions)
2938  * @dev: device handle
2939  *
2940  * We assume that PCI USB devices with RMRRs have them largely
2941  * for historical reasons and that the RMRR space is not actively used post
2942  * boot.  This exclusion may change if vendors begin to abuse it.
2943  *
2944  * The same exception is made for graphics devices, with the requirement that
2945  * any use of the RMRR regions will be torn down before assigning the device
2946  * to a guest.
2947  *
2948  * Return: true if the RMRR is relaxable, false otherwise
2949  */
2950 static bool device_rmrr_is_relaxable(struct device *dev)
2951 {
2952         struct pci_dev *pdev;
2953
2954         if (!dev_is_pci(dev))
2955                 return false;
2956
2957         pdev = to_pci_dev(dev);
2958         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2959                 return true;
2960         else
2961                 return false;
2962 }
2963
2964 /*
2965  * There are a couple of cases where we need to restrict the functionality of
2966  * devices associated with RMRRs.  The first is when evaluating a device for
2967  * identity mapping because problems exist when devices are moved in and out
2968  * of domains and their respective RMRR information is lost.  This means that
2969  * a device with associated RMRRs will never be in a "passthrough" domain.
2970  * The second is use of the device through the IOMMU API.  This interface
2971  * expects to have full control of the IOVA space for the device.  We cannot
2972  * satisfy both the requirement that RMRR access is maintained and have an
2973  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2974  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2975  * We therefore prevent devices associated with an RMRR from participating in
2976  * the IOMMU API, which eliminates them from device assignment.
2977  *
2978  * In both cases, devices with relaxable RMRRs are exempt from this
2979  * restriction; see the device_rmrr_is_relaxable() comment.
2980  */
2981 static bool device_is_rmrr_locked(struct device *dev)
2982 {
2983         if (!device_has_rmrr(dev))
2984                 return false;
2985
2986         if (device_rmrr_is_relaxable(dev))
2987                 return false;
2988
2989         return true;
2990 }
2991
2992 /*
2993  * Return the required default domain type for a specific device.
2994  *
2995  * @dev: the device in question
2997  *
2998  * Returns:
2999  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3000  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3001  *  - 0: both identity and dynamic domains work for this device
3002  */
3003 static int device_def_domain_type(struct device *dev)
3004 {
3005         if (dev_is_pci(dev)) {
3006                 struct pci_dev *pdev = to_pci_dev(dev);
3007
3008                 /*
3009                  * Prevent any device marked as untrusted from getting
3010                  * placed into the static identity mapping domain.
3011                  */
3012                 if (pdev->untrusted)
3013                         return IOMMU_DOMAIN_DMA;
3014
3015                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3016                         return IOMMU_DOMAIN_IDENTITY;
3017
3018                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3019                         return IOMMU_DOMAIN_IDENTITY;
3020         }
3021
3022         return 0;
3023 }
3024
3025 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3026 {
3027         /*
3028          * Start from a sane IOMMU hardware state.
3029          * If queued invalidation was already initialized by us
3030          * (for example, while enabling interrupt remapping), then
3031          * things are already rolling from a sane state.
3032          */
3033         if (!iommu->qi) {
3034                 /*
3035                  * Clear any previous faults.
3036                  */
3037                 dmar_fault(-1, iommu);
3038                 /*
3039                  * Disable queued invalidation if supported and already enabled
3040                  * before OS handover.
3041                  */
3042                 dmar_disable_qi(iommu);
3043         }
3044
3045         if (dmar_enable_qi(iommu)) {
3046                 /*
3047                  * Queued invalidation is not enabled, use register-based invalidation
3048                  */
3049                 iommu->flush.flush_context = __iommu_flush_context;
3050                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3051                 pr_info("%s: Using Register based invalidation\n",
3052                         iommu->name);
3053         } else {
3054                 iommu->flush.flush_context = qi_flush_context;
3055                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3056                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3057         }
3058 }
3059
3060 static int copy_context_table(struct intel_iommu *iommu,
3061                               struct root_entry *old_re,
3062                               struct context_entry **tbl,
3063                               int bus, bool ext)
3064 {
3065         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3066         struct context_entry *new_ce = NULL, ce;
3067         struct context_entry *old_ce = NULL;
3068         struct root_entry re;
3069         phys_addr_t old_ce_phys;
3070
3071         tbl_idx = ext ? bus * 2 : bus;
3072         memcpy(&re, old_re, sizeof(re));
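        /*
         * In the extended root-entry format each context entry is twice as
         * large, so a 4K context table holds only 128 entries: devfn
         * 0x00-0x7f are reached through the lower context-table pointer and
         * 0x80-0xff through the upper one.  That is why each bus occupies
         * two slots in tbl[] (tbl_idx = bus * 2) and idx wraps every 128
         * devfns.
         */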
3073
3074         for (devfn = 0; devfn < 256; devfn++) {
3075                 /* First calculate the correct index */
3076                 idx = (ext ? devfn * 2 : devfn) % 256;
3077
3078                 if (idx == 0) {
3079                         /* First save what we may have and clean up */
3080                         if (new_ce) {
3081                                 tbl[tbl_idx] = new_ce;
3082                                 __iommu_flush_cache(iommu, new_ce,
3083                                                     VTD_PAGE_SIZE);
3084                                 pos = 1;
3085                         }
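                        /*
                         * pos == 1 records that a table (the lower half in
                         * the extended format) has already been stored at
                         * tbl_idx, so the final store after the loop goes
                         * into the next slot (tbl_idx + pos).
                         */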
3086
3087                         if (old_ce)
3088                                 memunmap(old_ce);
3089
3090                         ret = 0;
3091                         if (devfn < 0x80)
3092                                 old_ce_phys = root_entry_lctp(&re);
3093                         else
3094                                 old_ce_phys = root_entry_uctp(&re);
3095
3096                         if (!old_ce_phys) {
3097                                 if (ext && devfn == 0) {
3098                                         /* No LCTP; jump so devfn++ tries the UCTP next */
3099                                         devfn = 0x7f;
3100                                         continue;
3101                                 } else {
3102                                         goto out;
3103                                 }
3104                         }
3105
3106                         ret = -ENOMEM;
3107                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3108                                         MEMREMAP_WB);
3109                         if (!old_ce)
3110                                 goto out;
3111
3112                         new_ce = alloc_pgtable_page(iommu->node);
3113                         if (!new_ce)
3114                                 goto out_unmap;
3115
3116                         ret = 0;
3117                 }
3118
3119                 /* Now copy the context entry */
3120                 memcpy(&ce, old_ce + idx, sizeof(ce));
3121
3122                 if (!__context_present(&ce))
3123                         continue;
3124
3125                 did = context_domain_id(&ce);
3126                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3127                         set_bit(did, iommu->domain_ids);
3128
3129                 /*
3130                  * We need a marker for copied context entries. This
3131                  * marker needs to work for the old format as well as
3132                  * for extended context entries.
3133                  *
3134                  * Bit 67 of the context entry is used. In the old
3135                  * format this bit is available to software, in the
3136                  * extended format it is the PGE bit, but PGE is ignored
3137                  * by HW if PASIDs are disabled (and thus still
3138                  * available).
3139                  *
3140                  * So disable PASIDs first and then mark the entry
3141                  * copied. This means that we don't copy PASID
3142                  * translations from the old kernel, but this is fine as
3143                  * faults there are not fatal.
3144                  */
3145                 context_clear_pasid_enable(&ce);
3146                 context_set_copied(&ce);
3147
3148                 new_ce[idx] = ce;
3149         }
3150
3151         tbl[tbl_idx + pos] = new_ce;
3152
3153         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3154
3155 out_unmap:
3156         memunmap(old_ce);
3157
3158 out:
3159         return ret;
3160 }
3161
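/*
 * Copy the root and context tables left behind by the previous kernel
 * (kdump case) so that DMA already set up by the old kernel keeps working.
 * Context entries are duplicated into freshly allocated tables, marked as
 * copied, and then hooked into this kernel's root table under iommu->lock.
 */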
3162 static int copy_translation_tables(struct intel_iommu *iommu)
3163 {
3164         struct context_entry **ctxt_tbls;
3165         struct root_entry *old_rt;
3166         phys_addr_t old_rt_phys;
3167         int ctxt_table_entries;
3168         unsigned long flags;
3169         u64 rtaddr_reg;
3170         int bus, ret;
3171         bool new_ext, ext;
3172
3173         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3174         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3175         new_ext    = !!ecap_ecs(iommu->ecap);
3176
3177         /*
3178          * The RTT bit can only be changed when translation is disabled,
3179          * but disabling translation opens a window for data corruption.
3180          * So bail out and don't copy anything if we would have to change
3181          * the bit; the caller then falls back to a fresh root table.
3182          */
3183         if (new_ext != ext)
3184                 return -EINVAL;
3185
3186         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3187         if (!old_rt_phys)
3188                 return -EINVAL;
3189
3190         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3191         if (!old_rt)
3192                 return -ENOMEM;
3193
3194         /* This is too big for the stack - allocate it from slab */
3195         ctxt_table_entries = ext ? 512 : 256;
3196         ret = -ENOMEM;
3197         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3198         if (!ctxt_tbls)
3199                 goto out_unmap;
3200
3201         for (bus = 0; bus < 256; bus++) {
3202                 ret = copy_context_table(iommu, &old_rt[bus],
3203                                          ctxt_tbls, bus, ext);
3204                 if (ret) {
3205                         pr_err("%s: Failed to copy context table for bus %d\n",
3206                                 iommu->name, bus);
3207                         continue;
3208                 }
3209         }
3210
3211         spin_lock_irqsave(&iommu->lock, flags);
3212
3213         /* Context tables are copied, now write them to the root_entry table */
3214         for (bus = 0; bus < 256; bus++) {
3215                 int idx = ext ? bus * 2 : bus;
3216                 u64 val;
3217
3218                 if (ctxt_tbls[idx]) {
3219                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3220                         iommu->root_entry[bus].lo = val;
3221                 }
3222
3223                 if (!ext || !ctxt_tbls[idx + 1])
3224                         continue;
3225
3226                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3227                 iommu->root_entry[bus].hi = val;
3228         }
3229
3230         spin_unlock_irqrestore(&iommu->lock, flags);
3231
3232         kfree(ctxt_tbls);
3233
3234         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3235
3236         ret = 0;
3237
3238 out_unmap:
3239         memunmap(old_rt);
3240
3241         return ret;
3242 }
3243
3244 #ifdef CONFIG_INTEL_IOMMU_SVM
3245 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3246 {
3247         struct intel_iommu *iommu = data;
3248         ioasid_t ioasid;
3249
3250         if (!iommu)
3251                 return INVALID_IOASID;
3252         /*
3253          * The VT-d virtual command interface always uses the full 20-bit
3254          * PASID range. The host can partition the guest PASID range based
3255          * on policy, but that is outside the guest's control.
3256          */
3257         if (min < PASID_MIN || max > intel_pasid_max_id)
3258                 return INVALID_IOASID;
3259
3260         if (vcmd_alloc_pasid(iommu, &ioasid))
3261                 return INVALID_IOASID;
3262
3263         return ioasid;
3264 }
3265
3266 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3267 {
3268         struct intel_iommu *iommu = data;
3269
3270         if (!iommu)
3271                 return;
3272         /*
3273          * Sanity checking of the IOASID owner is done at the upper layer,
3274          * e.g. VFIO. We can only free the PASID when all devices are unbound.
3275          */
3276         if (ioasid_find(NULL, ioasid, NULL)) {
3277                 pr_alert("Cannot free active IOASID %d\n", ioasid);
3278                 return;
3279         }
3280         vcmd_free_pasid(iommu, ioasid);
3281 }
3282
3283 static void register_pasid_allocator(struct intel_iommu *iommu)
3284 {
3285         /*
3286          * If we are running in the host, there is no need for a custom
3287          * allocator because PASIDs are allocated by the host system-wide.
3288          */
3289         if (!cap_caching_mode(iommu->cap))
3290                 return;
3291
3292         if (!sm_supported(iommu)) {
3293                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3294                 return;
3295         }
3296
3297         /*
3298          * Register a custom PASID allocator if we are running in a guest;
3299          * guest PASIDs must be obtained via the virtual command interface.
3300          * There can be multiple vIOMMUs in each guest but only one allocator
3301          * is active. All vIOMMU allocators eventually call the same host
3302          * allocator.
3303          */
3304         if (!vccap_pasid(iommu->vccap))
3305                 return;
3306
3307         pr_info("Register custom PASID allocator\n");
3308         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3309         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3310         iommu->pasid_allocator.pdata = (void *)iommu;
3311         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3312                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3313                 /*
3314                  * Disable scalable mode on this IOMMU if there
3315                  * is no custom allocator. Mixing SM-capable and
3316                  * non-SM vIOMMUs is not supported.
3317                  */
3318                 intel_iommu_sm = 0;
3319         }
3320 }
3321 #endif
3322
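/*
 * One-time initialization of all DMAR units found at boot: allocate the
 * global IOMMU array, set up per-IOMMU queued invalidation, domain IDs and
 * root entries (copying translation tables from the previous kernel where
 * translation is pre-enabled), initialize the static identity domain, and
 * finally enable fault reporting and, where supported, the SVM page
 * request queue.
 */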
3323 static int __init init_dmars(void)
3324 {
3325         struct dmar_drhd_unit *drhd;
3326         struct intel_iommu *iommu;
3327         int ret;
3328
3329         /*
3330          * for each drhd
3331          *    allocate root
3332          *    initialize and program root entry to not present
3333          * endfor
3334          */
3335         for_each_drhd_unit(drhd) {
3336                 /*
3337                  * No lock is needed as this is only incremented in the
3338                  * single-threaded kernel __init code path; all other
3339                  * accesses are read-only.
3340                  */
3341                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3342                         g_num_of_iommus++;
3343                         continue;
3344                 }
3345                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3346         }
3347
3348         /* Preallocate enough resources for IOMMU hot-addition */
3349         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3350                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3351
3352         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3353                         GFP_KERNEL);
3354         if (!g_iommus) {
3355                 pr_err("Allocating global iommu array failed\n");
3356                 ret = -ENOMEM;
3357                 goto error;
3358         }
3359
3360         for_each_iommu(iommu, drhd) {
3361                 if (drhd->ignored) {
3362                         iommu_disable_translation(iommu);
3363                         continue;
3364                 }
3365
3366                 /*
3367                  * Find the max PASID size of all IOMMUs in the system.
3368                  * We need to ensure the system PASID table is no bigger
3369                  * than the smallest supported size.
3370                  */
3371                 if (pasid_supported(iommu)) {
3372                         u32 temp = 2 << ecap_pss(iommu->ecap);
3373
3374                         intel_pasid_max_id = min_t(u32, temp,
3375                                                    intel_pasid_max_id);
3376                 }
3377
3378                 g_iommus[iommu->seq_id] = iommu;
3379
3380                 intel_iommu_init_qi(iommu);
3381
3382                 ret = iommu_init_domains(iommu);
3383                 if (ret)
3384                         goto free_iommu;
3385
3386                 init_translation_status(iommu);
3387
3388                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3389                         iommu_disable_translation(iommu);
3390                         clear_translation_pre_enabled(iommu);
3391                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3392                                 iommu->name);
3393                 }
3394
3395                 /*
3396                  * TBD:
3397                  * we could share the same root & context tables
3398                  * among all IOMMUs. Need to split this out later.
3399                  */
3400                 ret = iommu_alloc_root_entry(iommu);
3401                 if (ret)
3402                         goto free_iommu;
3403
3404                 if (translation_pre_enabled(iommu)) {
3405                         pr_info("Translation already enabled - trying to copy translation structures\n");
3406
3407                         ret = copy_translation_tables(iommu);
3408                         if (ret) {
3409                                 /*
3410                                  * We found the IOMMU with translation
3411                                  * enabled - but failed to copy over the
3412                                  * old root-entry table. Try to proceed
3413                                  * by disabling translation now and
3414                                  * allocating a clean root-entry table.
3415                                  * This might cause DMAR faults, but
3416                                  * probably the dump will still succeed.
3417                                  */
3418                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3419                                        iommu->name);
3420                                 iommu_disable_translation(iommu);
3421                                 clear_translation_pre_enabled(iommu);
3422                         } else {
3423                                 pr_info("Copied translation tables from previous kernel for %s\n",
3424                                         iommu->name);
3425                         }
3426                 }
3427
3428                 if (!ecap_pass_through(iommu->ecap))
3429                         hw_pass_through = 0;
3430
3431                 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3432                 pr_warn("Disable batched IOTLB flush due to virtualization\n");
3433                         intel_iommu_strict = 1;
3434                 }
3435                 intel_svm_check(iommu);
3436         }
3437
3438         /*
3439          * Now that qi is enabled on all iommus, set the root entry and flush
3440          * caches. This is required on some Intel X58 chipsets, otherwise the
3441          * flush_context function will loop forever and the boot hangs.
3442          */
3443         for_each_active_iommu(iommu, drhd) {
3444                 iommu_flush_write_buffer(iommu);
3445 #ifdef CONFIG_INTEL_IOMMU_SVM
3446                 register_pasid_allocator(iommu);
3447 #endif
3448                 iommu_set_root_entry(iommu);
3449         }
3450
3451 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3452         dmar_map_gfx = 0;
3453 #endif
3454
3455         if (!dmar_map_gfx)
3456                 iommu_identity_mapping |= IDENTMAP_GFX;
3457
3458         check_tylersburg_isoch();
3459
3460         ret = si_domain_init(hw_pass_through);
3461         if (ret)
3462                 goto free_iommu;
3463
3464         /*
3465          * for each drhd
3466          *   enable fault log
3467          *   global invalidate context cache
3468          *   global invalidate iotlb
3469          *   enable translation
3470          */
3471         for_each_iommu(iommu, drhd) {
3472                 if (drhd->ignored) {
3473                         /*
3474                          * we always have to disable PMRs or DMA may fail on
3475                          * this device
3476                          */
3477                         if (force_on)
3478                                 iommu_disable_protect_mem_regions(iommu);
3479                         continue;
3480                 }
3481
3482                 iommu_flush_write_buffer(iommu);
3483
3484 #ifdef CONFIG_INTEL_IOMMU_SVM
3485                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3486                         /*
3487                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3488                          * could cause a lock race, so drop the lock around it.
3489                          */
3490                         up_write(&dmar_global_lock);
3491                         ret = intel_svm_enable_prq(iommu);
3492                         down_write(&dmar_global_lock);
3493                         if (ret)
3494                                 goto free_iommu;
3495                 }
3496 #endif
3497                 ret = dmar_set_interrupt(iommu);
3498                 if (ret)
3499                         goto free_iommu;
3500         }
3501
3502         return 0;
3503
3504 free_iommu:
3505         for_each_active_iommu(iommu, drhd) {
3506                 disable_dmar_iommu(iommu);
3507                 free_dmar_iommu(iommu);
3508         }
3509         if (si_domain) {
3510                 domain_exit(si_domain);
3511                 si_domain = NULL;
3512         }
3513
3514         kfree(g_iommus);
3515
3516 error:
3517         return ret;
3518 }
3519
3520 /* This takes a number of _MM_ pages, not VTD pages */
3521 static unsigned long intel_alloc_iova(struct device *dev,
3522                                      struct dmar_domain *domain,
3523                                      unsigned long nrpages, uint64_t dma_mask)
3524 {
3525         unsigned long iova_pfn;
3526
3527         /*
3528          * Restrict dma_mask to the width that the iommu can handle.
3529          * First-level translation restricts the input address to a
3530          * canonical address (i.e., address bits 63:N have the same
3531          * value as address bit [N-1], where N is 48 with 4-level
3532          * paging and 57 with 5-level paging). Hence, skip bit
3533          * [N-1].
3534          */
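        /*
         * For example, if domain->gaw is 48 (4-level paging), the effective
         * mask is capped at DOMAIN_MAX_ADDR(47), i.e. just below 1ULL << 47.
         */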
3535         if (domain_use_first_level(domain))
3536                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3537                                  dma_mask);
3538         else
3539                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3540                                  dma_mask);
3541
3542         /* Ensure we reserve the whole size-aligned region */
3543         nrpages = __roundup_pow_of_two(nrpages);
3544
3545         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3546                 /*
3547                  * First try to allocate an I/O virtual address within
3548                  * DMA_BIT_MASK(32); if that fails, try allocating from
3549                  * the higher range.
3550                  */
3551                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3552                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3553                 if (iova_pfn)
3554                         return iova_pfn;
3555         }
3556         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3557                                    IOVA_PFN(dma_mask), true);
3558         if (unlikely(!iova_pfn)) {
3559                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3560                              nrpages);
3561                 return 0;
3562         }
3563
3564         return iova_pfn;
3565 }
3566
3567 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3568                                      size_t size, int dir, u64 dma_mask)
3569 {
3570         struct dmar_domain *domain;
3571         phys_addr_t start_paddr;
3572         unsigned long iova_pfn;
3573         int prot = 0;
3574         int ret;
3575         struct intel_iommu *iommu;
3576         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3577
3578         BUG_ON(dir == DMA_NONE);
3579
3580         if (unlikely(attach_deferred(dev)))
3581                 do_deferred_attach(dev);
3582
3583         domain = find_domain(dev);
3584         if (!domain)
3585                 return DMA_MAPPING_ERROR;
3586
3587         iommu = domain_get_iommu(domain);
3588         size = aligned_nrpages(paddr, size);
3589
3590         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3591         if (!iova_pfn)
3592                 goto error;
3593
3594         /*
3595          * Check if DMAR supports zero-length reads on write-only
3596          * mappings.
3597          */
3598         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3599                         !cap_zlr(iommu->cap))
3600                 prot |= DMA_PTE_READ;
3601         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3602                 prot |= DMA_PTE_WRITE;
3603         /*
3604          * paddr .. (paddr + size) might span a partial page, so we should map
3605          * the whole page.  Note: if two parts of one page are mapped separately,
3606          * we might end up with two guest addresses mapping to the same host
3607          * paddr, but this is not a big problem.
3608          */
3609         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3610                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3611         if (ret)
3612                 goto error;
3613
3614         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3615         start_paddr += paddr & ~PAGE_MASK;
3616
3617         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3618
3619         return start_paddr;
3620
3621 error:
3622         if (iova_pfn)
3623                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3624         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3625                 size, (unsigned long long)paddr, dir);
3626         return DMA_MAPPING_ERROR;
3627 }
3628
3629 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3630                                  unsigned long offset, size_t size,
3631                                  enum dma_data_direction dir,
3632                                  unsigned long attrs)
3633 {
3634         return __intel_map_single(dev, page_to_phys(page) + offset,
3635                                   size, dir, *dev->dma_mask);
3636 }
3637
3638 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3639                                      size_t size, enum dma_data_direction dir,
3640                                      unsigned long attrs)
3641 {
3642         return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3643 }
3644
3645 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3646 {
3647         struct dmar_domain *domain;
3648         unsigned long start_pfn, last_pfn;
3649         unsigned long nrpages;
3650         unsigned long iova_pfn;
3651         struct intel_iommu *iommu;
3652         struct page *freelist;
3653         struct pci_dev *pdev = NULL;
3654
3655         domain = find_domain(dev);
3656         BUG_ON(!domain);
3657
3658         iommu = domain_get_iommu(domain);
3659
3660         iova_pfn = IOVA_PFN(dev_addr);
3661
3662         nrpages = aligned_nrpages(dev_addr, size);
3663         start_pfn = mm_to_dma_pfn(iova_pfn);
3664         last_pfn = start_pfn + nrpages - 1;
3665
3666         if (dev_is_pci(dev))
3667                 pdev = to_pci_dev(dev);
3668
3669         freelist = domain_unmap(domain, start_pfn, last_pfn);
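        /*
         * Unmap the page tables first; then either flush the IOTLB
         * synchronously and free the IOVA right away (strict mode, untrusted
         * devices, or no flush queue available), or queue both the IOTLB
         * flush and the page freeing for deferred processing.
         */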
3670         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3671                         !has_iova_flush_queue(&domain->iovad)) {
3672                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3673                                       nrpages, !freelist, 0);
3674                 /* free iova */
3675                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3676                 dma_free_pagelist(freelist);
3677         } else {
3678                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3679                            (unsigned long)freelist);
3680                 /*
3681                  * Queue up the release of the unmap to save roughly 1/6th of
3682                  * the CPU time used up by the IOTLB flush operation.
3683                  */
3684         }
3685
3686         trace_unmap_single(dev, dev_addr, size);
3687 }
3688
3689 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3690                              size_t size, enum dma_data_direction dir,
3691                              unsigned long attrs)
3692 {
3693         intel_unmap(dev, dev_addr, size);
3694 }
3695
3696 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3697                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3698 {
3699         intel_unmap(dev, dev_addr, size);
3700 }
3701
3702 static void *intel_alloc_coherent(struct device *dev, size_t size,
3703                                   dma_addr_t *dma_handle, gfp_t flags,
3704                                   unsigned long attrs)
3705 {
3706         struct page *page = NULL;
3707         int order;
3708
3709         if (unlikely(attach_deferred(dev)))
3710                 do_deferred_attach(dev);
3711
3712         size = PAGE_ALIGN(size);
3713         order = get_order(size);
3714
3715         if (gfpflags_allow_blocking(flags)) {
3716                 unsigned int count = size >> PAGE_SHIFT;
3717
3718                 page = dma_alloc_from_contiguous(dev, count, order,
3719                                                  flags & __GFP_NOWARN);
3720         }
3721
3722         if (!page)
3723                 page = alloc_pages(flags, order);
3724         if (!page)
3725                 return NULL;
3726         memset(page_address(page), 0, size);
3727
3728         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3729                                          DMA_BIDIRECTIONAL,
3730                                          dev->coherent_dma_mask);
3731         if (*dma_handle != DMA_MAPPING_ERROR)
3732                 return page_address(page);
3733         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3734                 __free_pages(page, order);
3735
3736         return NULL;
3737 }
3738
3739 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3740                                 dma_addr_t dma_handle, unsigned long attrs)
3741 {
3742         int order;
3743         struct page *page = virt_to_page(vaddr);
3744
3745         size = PAGE_ALIGN(size);
3746         order = get_order(size);
3747
3748         intel_unmap(dev, dma_handle, size);
3749         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3750                 __free_pages(page, order);
3751 }
3752
3753 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3754                            int nelems, enum dma_data_direction dir,
3755                            unsigned long attrs)
3756 {
3757         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3758         unsigned long nrpages = 0;
3759         struct scatterlist *sg;
3760         int i;
3761
3762         for_each_sg(sglist, sg, nelems, i) {
3763                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3764         }
3765
3766         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3767
3768         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3769 }
3770
3771 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3772                         enum dma_data_direction dir, unsigned long attrs)
3773 {
3774         int i;
3775         struct dmar_domain *domain;
3776         size_t size = 0;
3777         int prot = 0;
3778         unsigned long iova_pfn;
3779         int ret;
3780         struct scatterlist *sg;
3781         unsigned long start_vpfn;
3782         struct intel_iommu *iommu;
3783
3784         BUG_ON(dir == DMA_NONE);
3785
3786         if (unlikely(attach_deferred(dev)))
3787                 do_deferred_attach(dev);
3788
3789         domain = find_domain(dev);
3790         if (!domain)
3791                 return 0;
3792
3793         iommu = domain_get_iommu(domain);
3794
3795         for_each_sg(sglist, sg, nelems, i)
3796                 size += aligned_nrpages(sg->offset, sg->length);
3797
3798         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3799                                 *dev->dma_mask);
3800         if (!iova_pfn) {
3801                 sglist->dma_length = 0;
3802                 return 0;
3803         }
3804
3805         /*
3806          * Check if DMAR supports zero-length reads on write-only
3807          * mappings.
3808          */
3809         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3810                         !cap_zlr(iommu->cap))
3811                 prot |= DMA_PTE_READ;
3812         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813                 prot |= DMA_PTE_WRITE;
3814
3815         start_vpfn = mm_to_dma_pfn(iova_pfn);
3816
3817         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818         if (unlikely(ret)) {
3819                 dma_pte_free_pagetable(domain, start_vpfn,
3820                                        start_vpfn + size - 1,
3821                                        agaw_to_level(domain->agaw) + 1);
3822                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3823                 return 0;
3824         }
3825
3826         for_each_sg(sglist, sg, nelems, i)
3827                 trace_map_sg(dev, i + 1, nelems, sg);
3828
3829         return nelems;
3830 }
3831
3832 static u64 intel_get_required_mask(struct device *dev)
3833 {
3834         return DMA_BIT_MASK(32);
3835 }
3836
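/*
 * DMA API implementation backed by IOVA allocation and IOMMU page-table
 * mappings, used for devices attached to a dynamic (non-identity) DMA
 * domain.
 */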
3837 static const struct dma_map_ops intel_dma_ops = {
3838         .alloc = intel_alloc_coherent,
3839         .free = intel_free_coherent,
3840         .map_sg = intel_map_sg,
3841         .unmap_sg = intel_unmap_sg,
3842         .map_page = intel_map_page,
3843         .unmap_page = intel_unmap_page,
3844         .map_resource = intel_map_resource,
3845         .unmap_resource = intel_unmap_resource,
3846         .dma_supported = dma_direct_supported,
3847         .mmap = dma_common_mmap,
3848         .get_sgtable = dma_common_get_sgtable,
3849         .alloc_pages = dma_common_alloc_pages,
3850         .free_pages = dma_common_free_pages,
3851         .get_required_mask = intel_get_required_mask,
3852 };
3853
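/*
 * The bounce_* helpers below implement the DMA API with swiotlb bounce
 * buffering: mappings whose start or size is not VTD_PAGE_SIZE aligned are
 * redirected through bounce pages so that a device (typically an untrusted
 * one) cannot reach unrelated data sharing the same IOMMU page.
 */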
3854 static void
3855 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3856                    enum dma_data_direction dir, enum dma_sync_target target)
3857 {
3858         struct dmar_domain *domain;
3859         phys_addr_t tlb_addr;
3860
3861         domain = find_domain(dev);
3862         if (WARN_ON(!domain))
3863                 return;
3864
3865         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3866         if (is_swiotlb_buffer(tlb_addr))
3867                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3868 }
3869
3870 static dma_addr_t
3871 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3872                   enum dma_data_direction dir, unsigned long attrs,
3873                   u64 dma_mask)
3874 {
3875         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3876         struct dmar_domain *domain;
3877         struct intel_iommu *iommu;
3878         unsigned long iova_pfn;
3879         unsigned long nrpages;
3880         phys_addr_t tlb_addr;
3881         int prot = 0;
3882         int ret;
3883
3884         if (unlikely(attach_deferred(dev)))
3885                 do_deferred_attach(dev);
3886
3887         domain = find_domain(dev);
3888
3889         if (WARN_ON(dir == DMA_NONE || !domain))
3890                 return DMA_MAPPING_ERROR;
3891
3892         iommu = domain_get_iommu(domain);
3893         if (WARN_ON(!iommu))
3894                 return DMA_MAPPING_ERROR;
3895
3896         nrpages = aligned_nrpages(0, size);
3897         iova_pfn = intel_alloc_iova(dev, domain,
3898                                     dma_to_mm_pfn(nrpages), dma_mask);
3899         if (!iova_pfn)
3900                 return DMA_MAPPING_ERROR;
3901
3902         /*
3903          * Check if DMAR supports zero-length reads on write-only
3904          * mappings.
3905          */
3906         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3907                         !cap_zlr(iommu->cap))
3908                 prot |= DMA_PTE_READ;
3909         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3910                 prot |= DMA_PTE_WRITE;
3911
3912         /*
3913          * If both the physical buffer start address and size are
3914          * page aligned, we don't need to use a bounce page.
3915          */
3916         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3917                 tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
3918                                 aligned_size, dir, attrs);
3919                 if (tlb_addr == DMA_MAPPING_ERROR) {
3920                         goto swiotlb_error;
3921                 } else {
3922                         /* Clean up the padding area. */
3923                         void *padding_start = phys_to_virt(tlb_addr);
3924                         size_t padding_size = aligned_size;
3925
3926                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3927                             (dir == DMA_TO_DEVICE ||
3928                              dir == DMA_BIDIRECTIONAL)) {
3929                                 padding_start += size;
3930                                 padding_size -= size;
3931                         }
3932
3933                         memset(padding_start, 0, padding_size);
3934                 }
3935         } else {
3936                 tlb_addr = paddr;
3937         }
3938
3939         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3940                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3941         if (ret)
3942                 goto mapping_error;
3943
3944         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3945
3946         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3947
3948 mapping_error:
3949         if (is_swiotlb_buffer(tlb_addr))
3950                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3951                                          aligned_size, dir, attrs);
3952 swiotlb_error:
3953         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3954         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3955                 size, (unsigned long long)paddr, dir);
3956
3957         return DMA_MAPPING_ERROR;
3958 }
3959
3960 static void
3961 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3962                     enum dma_data_direction dir, unsigned long attrs)
3963 {
3964         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3965         struct dmar_domain *domain;
3966         phys_addr_t tlb_addr;
3967
3968         domain = find_domain(dev);
3969         if (WARN_ON(!domain))
3970                 return;
3971
3972         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3973         if (WARN_ON(!tlb_addr))
3974                 return;
3975
3976         intel_unmap(dev, dev_addr, size);
3977         if (is_swiotlb_buffer(tlb_addr))
3978                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3979                                          aligned_size, dir, attrs);
3980
3981         trace_bounce_unmap_single(dev, dev_addr, size);
3982 }
3983
3984 static dma_addr_t
3985 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3986                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3987 {
3988         return bounce_map_single(dev, page_to_phys(page) + offset,
3989                                  size, dir, attrs, *dev->dma_mask);
3990 }
3991
3992 static dma_addr_t
3993 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3994                     enum dma_data_direction dir, unsigned long attrs)
3995 {
3996         return bounce_map_single(dev, phys_addr, size,
3997                                  dir, attrs, *dev->dma_mask);
3998 }
3999
4000 static void
4001 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4002                   enum dma_data_direction dir, unsigned long attrs)
4003 {
4004         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4005 }
4006
4007 static void
4008 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4009                       enum dma_data_direction dir, unsigned long attrs)
4010 {
4011         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4012 }
4013
4014 static void
4015 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4016                 enum dma_data_direction dir, unsigned long attrs)
4017 {
4018         struct scatterlist *sg;
4019         int i;
4020
4021         for_each_sg(sglist, sg, nelems, i)
4022                 bounce_unmap_page(dev, sg->dma_address,
4023                                   sg_dma_len(sg), dir, attrs);
4024 }
4025
4026 static int
4027 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4028               enum dma_data_direction dir, unsigned long attrs)
4029 {
4030         int i;
4031         struct scatterlist *sg;
4032
4033         for_each_sg(sglist, sg, nelems, i) {
4034                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4035                                                   sg->offset, sg->length,
4036                                                   dir, attrs);
4037                 if (sg->dma_address == DMA_MAPPING_ERROR)
4038                         goto out_unmap;
4039                 sg_dma_len(sg) = sg->length;
4040         }
4041
4042         for_each_sg(sglist, sg, nelems, i)
4043                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4044
4045         return nelems;
4046
4047 out_unmap:
4048         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4049         return 0;
4050 }
4051
4052 static void
4053 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4054                            size_t size, enum dma_data_direction dir)
4055 {
4056         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4057 }
4058
4059 static void
4060 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4061                               size_t size, enum dma_data_direction dir)
4062 {
4063         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4064 }
4065
4066 static void
4067 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4068                        int nelems, enum dma_data_direction dir)
4069 {
4070         struct scatterlist *sg;
4071         int i;
4072
4073         for_each_sg(sglist, sg, nelems, i)
4074                 bounce_sync_single(dev, sg_dma_address(sg),
4075                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4076 }
4077
4078 static void
4079 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4080                           int nelems, enum dma_data_direction dir)
4081 {
4082         struct scatterlist *sg;
4083         int i;
4084
4085         for_each_sg(sglist, sg, nelems, i)
4086                 bounce_sync_single(dev, sg_dma_address(sg),
4087                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4088 }
4089
4090 static const struct dma_map_ops bounce_dma_ops = {
4091         .alloc                  = intel_alloc_coherent,
4092         .free                   = intel_free_coherent,
4093         .map_sg                 = bounce_map_sg,
4094         .unmap_sg               = bounce_unmap_sg,
4095         .map_page               = bounce_map_page,
4096         .unmap_page             = bounce_unmap_page,
4097         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4098         .sync_single_for_device = bounce_sync_single_for_device,
4099         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4100         .sync_sg_for_device     = bounce_sync_sg_for_device,
4101         .map_resource           = bounce_map_resource,
4102         .unmap_resource         = bounce_unmap_resource,
4103         .alloc_pages            = dma_common_alloc_pages,
4104         .free_pages             = dma_common_free_pages,
4105         .dma_supported          = dma_direct_supported,
4106 };
4107
4108 static inline int iommu_domain_cache_init(void)
4109 {
4110         int ret = 0;
4111
4112         iommu_domain_cache = kmem_cache_create("iommu_domain",
4113                                          sizeof(struct dmar_domain),
4114                                          0,
4115                                          SLAB_HWCACHE_ALIGN,
4117                                          NULL);
4118         if (!iommu_domain_cache) {
4119                 pr_err("Couldn't create iommu_domain cache\n");
4120                 ret = -ENOMEM;
4121         }
4122
4123         return ret;
4124 }
4125
4126 static inline int iommu_devinfo_cache_init(void)
4127 {
4128         int ret = 0;
4129
4130         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4131                                          sizeof(struct device_domain_info),
4132                                          0,
4133                                          SLAB_HWCACHE_ALIGN,
4134                                          NULL);
4135         if (!iommu_devinfo_cache) {
4136                 pr_err("Couldn't create devinfo cache\n");
4137                 ret = -ENOMEM;
4138         }
4139
4140         return ret;
4141 }
4142
4143 static int __init iommu_init_mempool(void)
4144 {
4145         int ret;
4146         ret = iova_cache_get();
4147         if (ret)
4148                 return ret;
4149
4150         ret = iommu_domain_cache_init();
4151         if (ret)
4152                 goto domain_error;
4153
4154         ret = iommu_devinfo_cache_init();
4155         if (!ret)
4156                 return ret;
4157
4158         kmem_cache_destroy(iommu_domain_cache);
4159 domain_error:
4160         iova_cache_put();
4161
4162         return -ENOMEM;
4163 }
4164
4165 static void __init iommu_exit_mempool(void)
4166 {
4167         kmem_cache_destroy(iommu_devinfo_cache);
4168         kmem_cache_destroy(iommu_domain_cache);
4169         iova_cache_put();
4170 }
4171
4172 static void __init init_no_remapping_devices(void)
4173 {
4174         struct dmar_drhd_unit *drhd;
4175         struct device *dev;
4176         int i;
4177
4178         for_each_drhd_unit(drhd) {
4179                 if (!drhd->include_all) {
4180                         for_each_active_dev_scope(drhd->devices,
4181                                                   drhd->devices_cnt, i, dev)
4182                                 break;
4183                         /* ignore DMAR unit if no devices exist */
4184                         if (i == drhd->devices_cnt)
4185                                 drhd->ignored = 1;
4186                 }
4187         }
4188
4189         for_each_active_drhd_unit(drhd) {
4190                 if (drhd->include_all)
4191                         continue;
4192
4193                 for_each_active_dev_scope(drhd->devices,
4194                                           drhd->devices_cnt, i, dev)
4195                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4196                                 break;
4197                 if (i < drhd->devices_cnt)
4198                         continue;
4199
4200                 /* This IOMMU has *only* gfx devices. Either bypass it or
4201                    mark it as gfx-dedicated, as appropriate */
4202                 drhd->gfx_dedicated = 1;
4203                 if (!dmar_map_gfx)
4204                         drhd->ignored = 1;
4205         }
4206 }
4207
4208 #ifdef CONFIG_SUSPEND
4209 static int init_iommu_hw(void)
4210 {
4211         struct dmar_drhd_unit *drhd;
4212         struct intel_iommu *iommu = NULL;
4213
4214         for_each_active_iommu(iommu, drhd)
4215                 if (iommu->qi)
4216                         dmar_reenable_qi(iommu);
4217
4218         for_each_iommu(iommu, drhd) {
4219                 if (drhd->ignored) {
4220                         /*
4221                          * we always have to disable PMRs or DMA may fail on
4222                          * this device
4223                          */
4224                         if (force_on)
4225                                 iommu_disable_protect_mem_regions(iommu);
4226                         continue;
4227                 }
4228
4229                 iommu_flush_write_buffer(iommu);
4230                 iommu_set_root_entry(iommu);
4231                 iommu_enable_translation(iommu);
4232                 iommu_disable_protect_mem_regions(iommu);
4233         }
4234
4235         return 0;
4236 }
4237
4238 static void iommu_flush_all(void)
4239 {
4240         struct dmar_drhd_unit *drhd;
4241         struct intel_iommu *iommu;
4242
4243         for_each_active_iommu(iommu, drhd) {
4244                 iommu->flush.flush_context(iommu, 0, 0, 0,
4245                                            DMA_CCMD_GLOBAL_INVL);
4246                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4247                                          DMA_TLB_GLOBAL_FLUSH);
4248         }
4249 }
4250
4251 static int iommu_suspend(void)
4252 {
4253         struct dmar_drhd_unit *drhd;
4254         struct intel_iommu *iommu = NULL;
4255         unsigned long flag;
4256
4257         for_each_active_iommu(iommu, drhd) {
4258                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4259                                                  GFP_ATOMIC);
4260                 if (!iommu->iommu_state)
4261                         goto nomem;
4262         }
4263
4264         iommu_flush_all();
4265
4266         for_each_active_iommu(iommu, drhd) {
4267                 iommu_disable_translation(iommu);
4268
4269                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4270
4271                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4272                         readl(iommu->reg + DMAR_FECTL_REG);
4273                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4274                         readl(iommu->reg + DMAR_FEDATA_REG);
4275                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4276                         readl(iommu->reg + DMAR_FEADDR_REG);
4277                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4278                         readl(iommu->reg + DMAR_FEUADDR_REG);
4279
4280                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4281         }
4282         return 0;
4283
4284 nomem:
4285         for_each_active_iommu(iommu, drhd)
4286                 kfree(iommu->iommu_state);
4287
4288         return -ENOMEM;
4289 }
4290
4291 static void iommu_resume(void)
4292 {
4293         struct dmar_drhd_unit *drhd;
4294         struct intel_iommu *iommu = NULL;
4295         unsigned long flag;
4296
4297         if (init_iommu_hw()) {
4298                 if (force_on)
4299                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4300                 else
4301                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4302                 return;
4303         }
4304
4305         for_each_active_iommu(iommu, drhd) {
4306
4307                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4308
4309                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4310                         iommu->reg + DMAR_FECTL_REG);
4311                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4312                         iommu->reg + DMAR_FEDATA_REG);
4313                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4314                         iommu->reg + DMAR_FEADDR_REG);
4315                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4316                         iommu->reg + DMAR_FEUADDR_REG);
4317
4318                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4319         }
4320
4321         for_each_active_iommu(iommu, drhd)
4322                 kfree(iommu->iommu_state);
4323 }
4324
4325 static struct syscore_ops iommu_syscore_ops = {
4326         .resume         = iommu_resume,
4327         .suspend        = iommu_suspend,
4328 };
4329
4330 static void __init init_iommu_pm_ops(void)
4331 {
4332         register_syscore_ops(&iommu_syscore_ops);
4333 }
4334
4335 #else
4336 static inline void init_iommu_pm_ops(void) {}
4337 #endif  /* CONFIG_SUSPEND */
4338
4339 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4340 {
4341         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4342             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4343             rmrr->end_address <= rmrr->base_address ||
4344             arch_rmrr_sanity_check(rmrr))
4345                 return -EINVAL;
4346
4347         return 0;
4348 }
4349
4350 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4351 {
4352         struct acpi_dmar_reserved_memory *rmrr;
4353         struct dmar_rmrr_unit *rmrru;
4354
4355         rmrr = (struct acpi_dmar_reserved_memory *)header;
4356         if (rmrr_sanity_check(rmrr)) {
4357                 pr_warn(FW_BUG
4358                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4359                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4360                            rmrr->base_address, rmrr->end_address,
4361                            dmi_get_system_info(DMI_BIOS_VENDOR),
4362                            dmi_get_system_info(DMI_BIOS_VERSION),
4363                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4364                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4365         }
4366
4367         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4368         if (!rmrru)
4369                 goto out;
4370
4371         rmrru->hdr = header;
4372
4373         rmrru->base_address = rmrr->base_address;
4374         rmrru->end_address = rmrr->end_address;
4375
4376         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4377                                 ((void *)rmrr) + rmrr->header.length,
4378                                 &rmrru->devices_cnt);
4379         if (rmrru->devices_cnt && rmrru->devices == NULL)
4380                 goto free_rmrru;
4381
4382         list_add(&rmrru->list, &dmar_rmrr_units);
4383
4384         return 0;
4385 free_rmrru:
4386         kfree(rmrru);
4387 out:
4388         return -ENOMEM;
4389 }
4390
4391 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4392 {
4393         struct dmar_atsr_unit *atsru;
4394         struct acpi_dmar_atsr *tmp;
4395
4396         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4397                                 dmar_rcu_check()) {
4398                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4399                 if (atsr->segment != tmp->segment)
4400                         continue;
4401                 if (atsr->header.length != tmp->header.length)
4402                         continue;
4403                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4404                         return atsru;
4405         }
4406
4407         return NULL;
4408 }
4409
4410 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4411 {
4412         struct acpi_dmar_atsr *atsr;
4413         struct dmar_atsr_unit *atsru;
4414
4415         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4416                 return 0;
4417
4418         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4419         atsru = dmar_find_atsr(atsr);
4420         if (atsru)
4421                 return 0;
4422
4423         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4424         if (!atsru)
4425                 return -ENOMEM;
4426
4427         /*
4428          * If memory is allocated from slab by the ACPI _DSM method, we need to
4429          * copy the memory content because the memory buffer will be freed
4430          * on return.
4431          */
4432         atsru->hdr = (void *)(atsru + 1);
4433         memcpy(atsru->hdr, hdr, hdr->length);
4434         atsru->include_all = atsr->flags & 0x1;
4435         if (!atsru->include_all) {
4436                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4437                                 (void *)atsr + atsr->header.length,
4438                                 &atsru->devices_cnt);
4439                 if (atsru->devices_cnt && atsru->devices == NULL) {
4440                         kfree(atsru);
4441                         return -ENOMEM;
4442                 }
4443         }
4444
4445         list_add_rcu(&atsru->list, &dmar_atsr_units);
4446
4447         return 0;
4448 }
4449
4450 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4451 {
4452         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4453         kfree(atsru);
4454 }
4455
4456 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4457 {
4458         struct acpi_dmar_atsr *atsr;
4459         struct dmar_atsr_unit *atsru;
4460
4461         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4462         atsru = dmar_find_atsr(atsr);
4463         if (atsru) {
4464                 list_del_rcu(&atsru->list);
4465                 synchronize_rcu();
4466                 intel_iommu_free_atsr(atsru);
4467         }
4468
4469         return 0;
4470 }
4471
4472 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4473 {
4474         int i;
4475         struct device *dev;
4476         struct acpi_dmar_atsr *atsr;
4477         struct dmar_atsr_unit *atsru;
4478
4479         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4480         atsru = dmar_find_atsr(atsr);
4481         if (!atsru)
4482                 return 0;
4483
4484         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4485                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4486                                           i, dev)
4487                         return -EBUSY;
4488         }
4489
4490         return 0;
4491 }
4492
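/*
 * Bring a hot-added DMAR unit into service.  The unit must support the
 * features the running configuration already depends on (pass-through,
 * snoop control, super-pages); translation left enabled by firmware is
 * switched off first.  After domain-ID and root-entry allocation, ignored
 * units stop here (only their PMRs get disabled when forced on); all
 * others get an invalidation queue, a page-request queue where SVM is
 * usable, a fault interrupt, and finally translation is enabled with the
 * protected memory regions turned off.
 */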
4493 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4494 {
4495         int sp, ret;
4496         struct intel_iommu *iommu = dmaru->iommu;
4497
4498         if (g_iommus[iommu->seq_id])
4499                 return 0;
4500
4501         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4502                 pr_warn("%s: Doesn't support hardware pass through.\n",
4503                         iommu->name);
4504                 return -ENXIO;
4505         }
4506         if (!ecap_sc_support(iommu->ecap) &&
4507             domain_update_iommu_snooping(iommu)) {
4508                 pr_warn("%s: Doesn't support snooping.\n",
4509                         iommu->name);
4510                 return -ENXIO;
4511         }
4512         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4513         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4514                 pr_warn("%s: Doesn't support large page.\n",
4515                         iommu->name);
4516                 return -ENXIO;
4517         }
4518
4519         /*
4520          * Disable translation if already enabled prior to OS handover.
4521          */
4522         if (iommu->gcmd & DMA_GCMD_TE)
4523                 iommu_disable_translation(iommu);
4524
4525         g_iommus[iommu->seq_id] = iommu;
4526         ret = iommu_init_domains(iommu);
4527         if (ret == 0)
4528                 ret = iommu_alloc_root_entry(iommu);
4529         if (ret)
4530                 goto out;
4531
4532         intel_svm_check(iommu);
4533
4534         if (dmaru->ignored) {
4535                 /*
4536                  * We always have to disable PMRs, or DMA may fail on this device.
4537                  */
4538                 if (force_on)
4539                         iommu_disable_protect_mem_regions(iommu);
4540                 return 0;
4541         }
4542
4543         intel_iommu_init_qi(iommu);
4544         iommu_flush_write_buffer(iommu);
4545
4546 #ifdef CONFIG_INTEL_IOMMU_SVM
4547         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4548                 ret = intel_svm_enable_prq(iommu);
4549                 if (ret)
4550                         goto disable_iommu;
4551         }
4552 #endif
4553         ret = dmar_set_interrupt(iommu);
4554         if (ret)
4555                 goto disable_iommu;
4556
4557         iommu_set_root_entry(iommu);
4558         iommu_enable_translation(iommu);
4559
4560         iommu_disable_protect_mem_regions(iommu);
4561         return 0;
4562
4563 disable_iommu:
4564         disable_dmar_iommu(iommu);
4565 out:
4566         free_dmar_iommu(iommu);
4567         return ret;
4568 }
4569
4570 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4571 {
4572         int ret = 0;
4573         struct intel_iommu *iommu = dmaru->iommu;
4574
4575         if (!intel_iommu_enabled)
4576                 return 0;
4577         if (iommu == NULL)
4578                 return -EINVAL;
4579
4580         if (insert) {
4581                 ret = intel_iommu_add(dmaru);
4582         } else {
4583                 disable_dmar_iommu(iommu);
4584                 free_dmar_iommu(iommu);
4585         }
4586
4587         return ret;
4588 }
4589
4590 static void intel_iommu_free_dmars(void)
4591 {
4592         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4593         struct dmar_atsr_unit *atsru, *atsr_n;
4594
4595         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4596                 list_del(&rmrru->list);
4597                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4598                 kfree(rmrru);
4599         }
4600
4601         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4602                 list_del(&atsru->list);
4603                 intel_iommu_free_atsr(atsru);
4604         }
4605 }
4606
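/*
 * Decide whether ATS may be used for @dev.  Walk up from the device
 * towards the root: integrated devices (no upstream bridge) are always
 * allowed, paths through conventional PCI are never allowed, and for
 * everything else the root port must appear in the device scope of an
 * ATSR for the same PCI segment (or the ATSR must have include_all set).
 * Returns 1 if ATS is permitted, 0 otherwise.
 */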
4607 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4608 {
4609         int i, ret = 1;
4610         struct pci_bus *bus;
4611         struct pci_dev *bridge = NULL;
4612         struct device *tmp;
4613         struct acpi_dmar_atsr *atsr;
4614         struct dmar_atsr_unit *atsru;
4615
4616         dev = pci_physfn(dev);
4617         for (bus = dev->bus; bus; bus = bus->parent) {
4618                 bridge = bus->self;
4619                 /* If it's an integrated device, allow ATS */
4620                 if (!bridge)
4621                         return 1;
4622                 /* Connected via non-PCIe: no ATS */
4623                 if (!pci_is_pcie(bridge) ||
4624                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4625                         return 0;
4626                 /* If we found the root port, look it up in the ATSR */
4627                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4628                         break;
4629         }
4630
4631         rcu_read_lock();
4632         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4633                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4634                 if (atsr->segment != pci_domain_nr(dev->bus))
4635                         continue;
4636
4637                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4638                         if (tmp == &bridge->dev)
4639                                 goto out;
4640
4641                 if (atsru->include_all)
4642                         goto out;
4643         }
4644         ret = 0;
4645 out:
4646         rcu_read_unlock();
4647
4648         return ret;
4649 }
4650
4651 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4652 {
4653         int ret;
4654         struct dmar_rmrr_unit *rmrru;
4655         struct dmar_atsr_unit *atsru;
4656         struct acpi_dmar_atsr *atsr;
4657         struct acpi_dmar_reserved_memory *rmrr;
4658
4659         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4660                 return 0;
4661
4662         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4663                 rmrr = container_of(rmrru->hdr,
4664                                     struct acpi_dmar_reserved_memory, header);
4665                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4666                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4667                                 ((void *)rmrr) + rmrr->header.length,
4668                                 rmrr->segment, rmrru->devices,
4669                                 rmrru->devices_cnt);
4670                         if (ret < 0)
4671                                 return ret;
4672                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4673                         dmar_remove_dev_scope(info, rmrr->segment,
4674                                 rmrru->devices, rmrru->devices_cnt);
4675                 }
4676         }
4677
4678         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4679                 if (atsru->include_all)
4680                         continue;
4681
4682                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4683                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4684                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4685                                         (void *)atsr + atsr->header.length,
4686                                         atsr->segment, atsru->devices,
4687                                         atsru->devices_cnt);
4688                         if (ret > 0)
4689                                 break;
4690                         else if (ret < 0)
4691                                 return ret;
4692                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4693                         if (dmar_remove_dev_scope(info, atsr->segment,
4694                                         atsru->devices, atsru->devices_cnt))
4695                                 break;
4696                 }
4697         }
4698
4699         return 0;
4700 }
4701
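/*
 * Memory hotplug notifier for the static identity domain: when a range is
 * about to come online, extend si_domain's identity map to cover it
 * (returning NOTIFY_BAD on failure); when it goes offline or the online
 * is cancelled, unmap the range and flush the IOTLB of every active IOMMU
 * for the affected PFNs.
 */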
4702 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4703                                        unsigned long val, void *v)
4704 {
4705         struct memory_notify *mhp = v;
4706         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4707         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4708                         mhp->nr_pages - 1);
4709
4710         switch (val) {
4711         case MEM_GOING_ONLINE:
4712                 if (iommu_domain_identity_map(si_domain,
4713                                               start_vpfn, last_vpfn)) {
4714                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
4715                                 start_vpfn, last_vpfn);
4716                         return NOTIFY_BAD;
4717                 }
4718                 break;
4719
4720         case MEM_OFFLINE:
4721         case MEM_CANCEL_ONLINE:
4722                 {
4723                         struct dmar_drhd_unit *drhd;
4724                         struct intel_iommu *iommu;
4725                         struct page *freelist;
4726
4727                         freelist = domain_unmap(si_domain,
4728                                                 start_vpfn, last_vpfn);
4729
4730                         rcu_read_lock();
4731                         for_each_active_iommu(iommu, drhd)
4732                                 iommu_flush_iotlb_psi(iommu, si_domain,
4733                                         start_vpfn, mhp->nr_pages,
4734                                         !freelist, 0);
4735                         rcu_read_unlock();
4736                         dma_free_pagelist(freelist);
4737                 }
4738                 break;
4739         }
4740
4741         return NOTIFY_OK;
4742 }
4743
4744 static struct notifier_block intel_iommu_memory_nb = {
4745         .notifier_call = intel_iommu_memory_notifier,
4746         .priority = 0
4747 };
4748
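/*
 * CPU hotplug teardown helper: when a CPU dies, walk every DMA-API domain
 * on every IOMMU and release that CPU's per-CPU cached IOVA ranges back to
 * the domain's IOVA allocator, so the ranges do not stay stranded in a
 * cache nobody will drain.
 */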
4749 static void free_all_cpu_cached_iovas(unsigned int cpu)
4750 {
4751         int i;
4752
4753         for (i = 0; i < g_num_of_iommus; i++) {
4754                 struct intel_iommu *iommu = g_iommus[i];
4755                 struct dmar_domain *domain;
4756                 int did;
4757
4758                 if (!iommu)
4759                         continue;
4760
4761                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4762                         domain = get_iommu_domain(iommu, (u16)did);
4763
4764                         if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4765                                 continue;
4766
4767                         free_cpu_cached_iovas(cpu, &domain->iovad);
4768                 }
4769         }
4770 }
4771
4772 static int intel_iommu_cpu_dead(unsigned int cpu)
4773 {
4774         free_all_cpu_cached_iovas(cpu);
4775         return 0;
4776 }
4777
4778 static void intel_disable_iommus(void)
4779 {
4780         struct intel_iommu *iommu = NULL;
4781         struct dmar_drhd_unit *drhd;
4782
4783         for_each_iommu(iommu, drhd)
4784                 iommu_disable_translation(iommu);
4785 }
4786
4787 void intel_iommu_shutdown(void)
4788 {
4789         struct dmar_drhd_unit *drhd;
4790         struct intel_iommu *iommu = NULL;
4791
4792         if (no_iommu || dmar_disabled)
4793                 return;
4794
4795         down_write(&dmar_global_lock);
4796
4797         /* Disable PMRs explicitly here. */
4798         for_each_iommu(iommu, drhd)
4799                 iommu_disable_protect_mem_regions(iommu);
4800
4801         /* Make sure the IOMMUs are switched off */
4802         intel_disable_iommus();
4803
4804         up_write(&dmar_global_lock);
4805 }
4806
4807 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4808 {
4809         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4810
4811         return container_of(iommu_dev, struct intel_iommu, iommu);
4812 }
4813
4814 static ssize_t intel_iommu_show_version(struct device *dev,
4815                                         struct device_attribute *attr,
4816                                         char *buf)
4817 {
4818         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4819         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4820         return sprintf(buf, "%d:%d\n",
4821                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4822 }
4823 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4824
4825 static ssize_t intel_iommu_show_address(struct device *dev,
4826                                         struct device_attribute *attr,
4827                                         char *buf)
4828 {
4829         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4830         return sprintf(buf, "%llx\n", iommu->reg_phys);
4831 }
4832 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4833
4834 static ssize_t intel_iommu_show_cap(struct device *dev,
4835                                     struct device_attribute *attr,
4836                                     char *buf)
4837 {
4838         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4839         return sprintf(buf, "%llx\n", iommu->cap);
4840 }
4841 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4842
4843 static ssize_t intel_iommu_show_ecap(struct device *dev,
4844                                     struct device_attribute *attr,
4845                                     char *buf)
4846 {
4847         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4848         return sprintf(buf, "%llx\n", iommu->ecap);
4849 }
4850 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4851
4852 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4853                                       struct device_attribute *attr,
4854                                       char *buf)
4855 {
4856         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4857         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4858 }
4859 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4860
4861 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4862                                            struct device_attribute *attr,
4863                                            char *buf)
4864 {
4865         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4866         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4867                                                   cap_ndoms(iommu->cap)));
4868 }
4869 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4870
4871 static struct attribute *intel_iommu_attrs[] = {
4872         &dev_attr_version.attr,
4873         &dev_attr_address.attr,
4874         &dev_attr_cap.attr,
4875         &dev_attr_ecap.attr,
4876         &dev_attr_domains_supported.attr,
4877         &dev_attr_domains_used.attr,
4878         NULL,
4879 };
4880
4881 static struct attribute_group intel_iommu_group = {
4882         .name = "intel-iommu",
4883         .attrs = intel_iommu_attrs,
4884 };
4885
4886 const struct attribute_group *intel_iommu_groups[] = {
4887         &intel_iommu_group,
4888         NULL,
4889 };
4890
4891 static inline bool has_external_pci(void)
4892 {
4893         struct pci_dev *pdev = NULL;
4894
4895         for_each_pci_dev(pdev)
4896                 if (pdev->external_facing)
4897                         return true;
4898
4899         return false;
4900 }
4901
4902 static int __init platform_optin_force_iommu(void)
4903 {
4904         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4905                 return 0;
4906
4907         if (no_iommu || dmar_disabled)
4908                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4909
4910         /*
4911          * If Intel-IOMMU is disabled by default, we will apply identity
4912          * map for all devices except those marked as being untrusted.
4913          */
4914         if (dmar_disabled)
4915                 iommu_set_default_passthrough(false);
4916
4917         dmar_disabled = 0;
4918         no_iommu = 0;
4919
4920         return 1;
4921 }
4922
4923 static int __init probe_acpi_namespace_devices(void)
4924 {
4925         struct dmar_drhd_unit *drhd;
4926         /* To avoid a -Wunused-but-set-variable warning. */
4927         struct intel_iommu *iommu __maybe_unused;
4928         struct device *dev;
4929         int i, ret = 0;
4930
4931         for_each_active_iommu(iommu, drhd) {
4932                 for_each_active_dev_scope(drhd->devices,
4933                                           drhd->devices_cnt, i, dev) {
4934                         struct acpi_device_physical_node *pn;
4935                         struct iommu_group *group;
4936                         struct acpi_device *adev;
4937
4938                         if (dev->bus != &acpi_bus_type)
4939                                 continue;
4940
4941                         adev = to_acpi_device(dev);
4942                         mutex_lock(&adev->physical_node_lock);
4943                         list_for_each_entry(pn,
4944                                             &adev->physical_node_list, node) {
4945                                 group = iommu_group_get(pn->dev);
4946                                 if (group) {
4947                                         iommu_group_put(group);
4948                                         continue;
4949                                 }
4950
4951                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4952                                 ret = iommu_probe_device(pn->dev);
4953                                 if (ret)
4954                                         break;
4955                         }
4956                         mutex_unlock(&adev->physical_node_lock);
4957
4958                         if (ret)
4959                                 return ret;
4960                 }
4961         }
4962
4963         return 0;
4964 }
4965
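/*
 * Main initialization entry point for DMA remapping.  In rough order:
 * honour tboot/platform opt-in forcing, set up the mempool, parse the
 * DMAR table and device scopes, and bail out early (after switching any
 * enabled units off) when the IOMMU is disabled.  Otherwise set up the
 * reserved IOVA ranges, run init_dmars(), register the units with the
 * IOMMU core and sysfs, hook up memory/CPU hotplug handling, probe ACPI
 * namespace devices, and enable translation on every unit that firmware
 * had not already enabled.
 */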
4966 int __init intel_iommu_init(void)
4967 {
4968         int ret = -ENODEV;
4969         struct dmar_drhd_unit *drhd;
4970         struct intel_iommu *iommu;
4971
4972         /*
4973          * Intel IOMMU is required for a TXT/tboot launch or platform
4974          * opt in, so enforce that.
4975          */
4976         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4977                     platform_optin_force_iommu();
4978
4979         if (iommu_init_mempool()) {
4980                 if (force_on)
4981                         panic("tboot: Failed to initialize iommu memory\n");
4982                 return -ENOMEM;
4983         }
4984
4985         down_write(&dmar_global_lock);
4986         if (dmar_table_init()) {
4987                 if (force_on)
4988                         panic("tboot: Failed to initialize DMAR table\n");
4989                 goto out_free_dmar;
4990         }
4991
4992         if (dmar_dev_scope_init() < 0) {
4993                 if (force_on)
4994                         panic("tboot: Failed to initialize DMAR device scope\n");
4995                 goto out_free_dmar;
4996         }
4997
4998         up_write(&dmar_global_lock);
4999
5000         /*
5001          * The bus notifier takes the dmar_global_lock, so lockdep will
5002          * complain later when we register it under the lock.
5003          */
5004         dmar_register_bus_notifier();
5005
5006         down_write(&dmar_global_lock);
5007
5008         if (!no_iommu)
5009                 intel_iommu_debugfs_init();
5010
5011         if (no_iommu || dmar_disabled) {
5012                 /*
5013                  * We exit the function here to ensure the IOMMU's remapping and
5014                  * mempool aren't set up, which means that the IOMMU's PMRs
5015                  * won't be disabled via the call to init_dmars(). So disable
5016                  * them explicitly here. The PMRs were set up by tboot prior to
5017                  * calling SENTER, but the kernel is expected to reset/tear
5018                  * down the PMRs.
5019                  */
5020                 if (intel_iommu_tboot_noforce) {
5021                         for_each_iommu(iommu, drhd)
5022                                 iommu_disable_protect_mem_regions(iommu);
5023                 }
5024
5025                 /*
5026                  * Make sure the IOMMUs are switched off, even when we
5027                  * boot into a kexec kernel and the previous kernel left
5028                  * them enabled
5029                  */
5030                 intel_disable_iommus();
5031                 goto out_free_dmar;
5032         }
5033
5034         if (list_empty(&dmar_rmrr_units))
5035                 pr_info("No RMRR found\n");
5036
5037         if (list_empty(&dmar_atsr_units))
5038                 pr_info("No ATSR found\n");
5039
5040         if (dmar_init_reserved_ranges()) {
5041                 if (force_on)
5042                         panic("tboot: Failed to reserve iommu ranges\n");
5043                 goto out_free_reserved_range;
5044         }
5045
5046         if (dmar_map_gfx)
5047                 intel_iommu_gfx_mapped = 1;
5048
5049         init_no_remapping_devices();
5050
5051         ret = init_dmars();
5052         if (ret) {
5053                 if (force_on)
5054                         panic("tboot: Failed to initialize DMARs\n");
5055                 pr_err("Initialization failed\n");
5056                 goto out_free_reserved_range;
5057         }
5058         up_write(&dmar_global_lock);
5059
5060         init_iommu_pm_ops();
5061
5062         down_read(&dmar_global_lock);
5063         for_each_active_iommu(iommu, drhd) {
5064                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5065                                        intel_iommu_groups,
5066                                        "%s", iommu->name);
5067                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5068                 iommu_device_register(&iommu->iommu);
5069         }
5070         up_read(&dmar_global_lock);
5071
5072         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5073         if (si_domain && !hw_pass_through)
5074                 register_memory_notifier(&intel_iommu_memory_nb);
5075         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5076                           intel_iommu_cpu_dead);
5077
5078         down_read(&dmar_global_lock);
5079         if (probe_acpi_namespace_devices())
5080                 pr_warn("ACPI name space devices didn't probe correctly\n");
5081
5082         /* Finally, we enable the DMA remapping hardware. */
5083         for_each_iommu(iommu, drhd) {
5084                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5085                         iommu_enable_translation(iommu);
5086
5087                 iommu_disable_protect_mem_regions(iommu);
5088         }
5089         up_read(&dmar_global_lock);
5090
5091         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5092
5093         intel_iommu_enabled = 1;
5094
5095         return 0;
5096
5097 out_free_reserved_range:
5098         put_iova_domain(&reserved_iova_list);
5099 out_free_dmar:
5100         intel_iommu_free_dmars();
5101         up_write(&dmar_global_lock);
5102         iommu_exit_mempool();
5103         return ret;
5104 }
5105
5106 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5107 {
5108         struct intel_iommu *iommu = opaque;
5109
5110         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5111         return 0;
5112 }
5113
5114 /*
5115  * NB - intel-iommu lacks any sort of reference counting for the users of
5116  * dependent devices.  If multiple endpoints have intersecting dependent
5117  * devices, unbinding the driver from any one of them will possibly leave
5118  * the others unable to operate.
5119  */
5120 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5121 {
5122         if (!iommu || !dev || !dev_is_pci(dev))
5123                 return;
5124
5125         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5126 }
5127
5128 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5129 {
5130         struct dmar_domain *domain;
5131         struct intel_iommu *iommu;
5132         unsigned long flags;
5133
5134         assert_spin_locked(&device_domain_lock);
5135
5136         if (WARN_ON(!info))
5137                 return;
5138
5139         iommu = info->iommu;
5140         domain = info->domain;
5141
5142         if (info->dev) {
5143                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5144                         intel_pasid_tear_down_entry(iommu, info->dev,
5145                                         PASID_RID2PASID, false);
5146
5147                 iommu_disable_dev_iotlb(info);
5148                 if (!dev_is_real_dma_subdevice(info->dev))
5149                         domain_context_clear(iommu, info->dev);
5150                 intel_pasid_free_table(info->dev);
5151         }
5152
5153         unlink_domain_info(info);
5154
5155         spin_lock_irqsave(&iommu->lock, flags);
5156         domain_detach_iommu(domain, iommu);
5157         spin_unlock_irqrestore(&iommu->lock, flags);
5158
5159         free_devinfo_mem(info);
5160 }
5161
5162 static void dmar_remove_one_dev_info(struct device *dev)
5163 {
5164         struct device_domain_info *info;
5165         unsigned long flags;
5166
5167         spin_lock_irqsave(&device_domain_lock, flags);
5168         info = get_domain_info(dev);
5169         if (info)
5170                 __dmar_remove_one_dev_info(info);
5171         spin_unlock_irqrestore(&device_domain_lock, flags);
5172 }
5173
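/*
 * Minimal setup for a freshly allocated UNMANAGED/DMA domain: derive the
 * adjusted width and AGAW from the requested guest width, start with the
 * coherency/snooping/super-page flags cleared (they are recalculated when
 * the domain is attached to an IOMMU), and allocate the top-level page
 * directory.
 */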
5174 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5175 {
5176         int adjust_width;
5177
5178         /* calculate AGAW */
5179         domain->gaw = guest_width;
5180         adjust_width = guestwidth_to_adjustwidth(guest_width);
5181         domain->agaw = width_to_agaw(adjust_width);
5182
5183         domain->iommu_coherency = 0;
5184         domain->iommu_snooping = 0;
5185         domain->iommu_superpage = 0;
5186         domain->max_addr = 0;
5187
5188         /* always allocate the top pgd */
5189         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5190         if (!domain->pgd)
5191                 return -ENOMEM;
5192         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5193         return 0;
5194 }
5195
5196 static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5197 {
5198         init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5199         copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5200
5201         if (!intel_iommu_strict &&
5202             init_iova_flush_queue(&dmar_domain->iovad,
5203                                   iommu_flush_iova, iova_entry_free))
5204                 pr_info("iova flush queue initialization failed\n");
5205 }
5206
5207 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5208 {
5209         struct dmar_domain *dmar_domain;
5210         struct iommu_domain *domain;
5211
5212         switch (type) {
5213         case IOMMU_DOMAIN_DMA:
5214         case IOMMU_DOMAIN_UNMANAGED:
5215                 dmar_domain = alloc_domain(0);
5216                 if (!dmar_domain) {
5217                         pr_err("Can't allocate dmar_domain\n");
5218                         return NULL;
5219                 }
5220                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5221                         pr_err("Domain initialization failed\n");
5222                         domain_exit(dmar_domain);
5223                         return NULL;
5224                 }
5225
5226                 if (type == IOMMU_DOMAIN_DMA)
5227                         intel_init_iova_domain(dmar_domain);
5228
5229                 domain = &dmar_domain->domain;
5230                 domain->geometry.aperture_start = 0;
5231                 domain->geometry.aperture_end   =
5232                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5233                 domain->geometry.force_aperture = true;
5234
5235                 return domain;
5236         case IOMMU_DOMAIN_IDENTITY:
5237                 return &si_domain->domain;
5238         default:
5239                 return NULL;
5240         }
5241
5242         return NULL;
5243 }
5244
5245 static void intel_iommu_domain_free(struct iommu_domain *domain)
5246 {
5247         if (domain != &si_domain->domain)
5248                 domain_exit(to_dmar_domain(domain));
5249 }
5250
5251 /*
5252  * Check whether a @domain could be attached to the @dev through the
5253  * aux-domain attach/detach APIs.
5254  */
5255 static inline bool
5256 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5257 {
5258         struct device_domain_info *info = get_domain_info(dev);
5259
5260         return info && info->auxd_enabled &&
5261                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5262 }
5263
5264 static void auxiliary_link_device(struct dmar_domain *domain,
5265                                   struct device *dev)
5266 {
5267         struct device_domain_info *info = get_domain_info(dev);
5268
5269         assert_spin_locked(&device_domain_lock);
5270         if (WARN_ON(!info))
5271                 return;
5272
5273         domain->auxd_refcnt++;
5274         list_add(&domain->auxd, &info->auxiliary_domains);
5275 }
5276
5277 static void auxiliary_unlink_device(struct dmar_domain *domain,
5278                                     struct device *dev)
5279 {
5280         struct device_domain_info *info = get_domain_info(dev);
5281
5282         assert_spin_locked(&device_domain_lock);
5283         if (WARN_ON(!info))
5284                 return;
5285
5286         list_del(&domain->auxd);
5287         domain->auxd_refcnt--;
5288
5289         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5290                 ioasid_free(domain->default_pasid);
5291 }
5292
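/*
 * Attach @domain to @dev as an auxiliary domain (used for mediated
 * devices): allocate the domain's default PASID on first use, attach the
 * domain to the device's IOMMU under iommu->lock, install a first- or
 * second-level PASID-table entry for that PASID depending on how the
 * domain translates, and finally link the domain into the device's list
 * of auxiliary domains.
 */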
5293 static int aux_domain_add_dev(struct dmar_domain *domain,
5294                               struct device *dev)
5295 {
5296         int ret;
5297         unsigned long flags;
5298         struct intel_iommu *iommu;
5299
5300         iommu = device_to_iommu(dev, NULL, NULL);
5301         if (!iommu)
5302                 return -ENODEV;
5303
5304         if (domain->default_pasid <= 0) {
5305                 u32 pasid;
5306
5307                 /* No private data needed for the default pasid */
5308                 pasid = ioasid_alloc(NULL, PASID_MIN,
5309                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5310                                      NULL);
5311                 if (pasid == INVALID_IOASID) {
5312                         pr_err("Can't allocate default pasid\n");
5313                         return -ENODEV;
5314                 }
5315                 domain->default_pasid = pasid;
5316         }
5317
5318         spin_lock_irqsave(&device_domain_lock, flags);
5319         /*
5320          * iommu->lock must be held to attach the domain to the iommu and to
5321          * set up the PASID entry for second-level translation.
5322          */
5323         spin_lock(&iommu->lock);
5324         ret = domain_attach_iommu(domain, iommu);
5325         if (ret)
5326                 goto attach_failed;
5327
5328         /* Set up the PASID entry for mediated devices: */
5329         if (domain_use_first_level(domain))
5330                 ret = domain_setup_first_level(iommu, domain, dev,
5331                                                domain->default_pasid);
5332         else
5333                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5334                                                      domain->default_pasid);
5335         if (ret)
5336                 goto table_failed;
5337         spin_unlock(&iommu->lock);
5338
5339         auxiliary_link_device(domain, dev);
5340
5341         spin_unlock_irqrestore(&device_domain_lock, flags);
5342
5343         return 0;
5344
5345 table_failed:
5346         domain_detach_iommu(domain, iommu);
5347 attach_failed:
5348         spin_unlock(&iommu->lock);
5349         spin_unlock_irqrestore(&device_domain_lock, flags);
5350         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5351                 ioasid_free(domain->default_pasid);
5352
5353         return ret;
5354 }
5355
5356 static void aux_domain_remove_dev(struct dmar_domain *domain,
5357                                   struct device *dev)
5358 {
5359         struct device_domain_info *info;
5360         struct intel_iommu *iommu;
5361         unsigned long flags;
5362
5363         if (!is_aux_domain(dev, &domain->domain))
5364                 return;
5365
5366         spin_lock_irqsave(&device_domain_lock, flags);
5367         info = get_domain_info(dev);
5368         iommu = info->iommu;
5369
5370         auxiliary_unlink_device(domain, dev);
5371
5372         spin_lock(&iommu->lock);
5373         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5374         domain_detach_iommu(domain, iommu);
5375         spin_unlock(&iommu->lock);
5376
5377         spin_unlock_irqrestore(&device_domain_lock, flags);
5378 }
5379
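/*
 * Common checks before attaching a domain to @dev: clamp the usable
 * address width to the IOMMU's MGAW, refuse the attach if addresses the
 * domain has already mapped would no longer fit, and pop superfluous
 * top-level page tables until the domain's AGAW is no wider than the
 * IOMMU's.
 */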
5380 static int prepare_domain_attach_device(struct iommu_domain *domain,
5381                                         struct device *dev)
5382 {
5383         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5384         struct intel_iommu *iommu;
5385         int addr_width;
5386
5387         iommu = device_to_iommu(dev, NULL, NULL);
5388         if (!iommu)
5389                 return -ENODEV;
5390
5391         /* check if this iommu agaw is sufficient for max mapped address */
5392         addr_width = agaw_to_width(iommu->agaw);
5393         if (addr_width > cap_mgaw(iommu->cap))
5394                 addr_width = cap_mgaw(iommu->cap);
5395
5396         if (dmar_domain->max_addr > (1LL << addr_width)) {
5397                 dev_err(dev, "%s: iommu width (%d) is not "
5398                         "sufficient for the mapped address (%llx)\n",
5399                         __func__, addr_width, dmar_domain->max_addr);
5400                 return -EFAULT;
5401         }
5402         dmar_domain->gaw = addr_width;
5403
5404         /*
5405          * Knock out extra levels of page tables if necessary
5406          */
5407         while (iommu->agaw < dmar_domain->agaw) {
5408                 struct dma_pte *pte;
5409
5410                 pte = dmar_domain->pgd;
5411                 if (dma_pte_present(pte)) {
5412                         dmar_domain->pgd = (struct dma_pte *)
5413                                 phys_to_virt(dma_pte_addr(pte));
5414                         free_pgtable_page(pte);
5415                 }
5416                 dmar_domain->agaw--;
5417         }
5418
5419         return 0;
5420 }
5421
5422 static int intel_iommu_attach_device(struct iommu_domain *domain,
5423                                      struct device *dev)
5424 {
5425         int ret;
5426
5427         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5428             device_is_rmrr_locked(dev)) {
5429                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5430                 return -EPERM;
5431         }
5432
5433         if (is_aux_domain(dev, domain))
5434                 return -EPERM;
5435
5436         /* normally dev is not mapped */
5437         if (unlikely(domain_context_mapped(dev))) {
5438                 struct dmar_domain *old_domain;
5439
5440                 old_domain = find_domain(dev);
5441                 if (old_domain)
5442                         dmar_remove_one_dev_info(dev);
5443         }
5444
5445         ret = prepare_domain_attach_device(domain, dev);
5446         if (ret)
5447                 return ret;
5448
5449         return domain_add_dev_info(to_dmar_domain(domain), dev);
5450 }
5451
5452 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5453                                          struct device *dev)
5454 {
5455         int ret;
5456
5457         if (!is_aux_domain(dev, domain))
5458                 return -EPERM;
5459
5460         ret = prepare_domain_attach_device(domain, dev);
5461         if (ret)
5462                 return ret;
5463
5464         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5465 }
5466
5467 static void intel_iommu_detach_device(struct iommu_domain *domain,
5468                                       struct device *dev)
5469 {
5470         dmar_remove_one_dev_info(dev);
5471 }
5472
5473 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5474                                           struct device *dev)
5475 {
5476         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5477 }
5478
5479 #ifdef CONFIG_INTEL_IOMMU_SVM
5480 /*
5481  * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5482  * VT-d granularity. Invalidation is typically included in the unmap operation
5483  * as a result of DMA or VFIO unmap. However, for assigned devices the guest
5484  * owns the first-level page tables. Invalidations of translation caches in the
5485  * guest are trapped and passed down to the host.
5486  *
5487  * The vIOMMU in the guest will only expose first-level page tables; therefore
5488  * we do not support IOTLB granularity for requests without PASID (second level).
5489  *
5490  * For example, to find the VT-d granularity encoding for IOTLB
5491  * type and page selective granularity within PASID:
5492  * X: indexed by iommu cache type
5493  * Y: indexed by enum iommu_inv_granularity
5494  * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5495  */
5496
5497 static const int
5498 inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5499         /*
5500          * PASID based IOTLB invalidation: PASID selective (per PASID),
5501          * page selective (address granularity)
5502          */
5503         {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5504         /* PASID based dev TLBs */
5505         {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5506         /* PASID cache */
5507         {-EINVAL, -EINVAL, -EINVAL}
5508 };
5509
5510 static inline int to_vtd_granularity(int type, int granu)
5511 {
5512         return inv_type_granu_table[type][granu];
5513 }
5514
5515 static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5516 {
5517         u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5518
5519         /* The VT-d size field is encoded as 2^size in 4KiB pages: 0 for 4KiB,
5520          * 9 for 2MiB, etc. The IOMMU cache invalidate API passes granu_size in
5521          * bytes and the number of granules of that size in contiguous memory.
5522          */
5523         return order_base_2(nr_pages);
5524 }
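/*
 * For example, invalidating 512 contiguous 4KiB granules (granu_size =
 * 4096, nr_granules = 512) covers 2MiB: nr_pages is 512 and
 * order_base_2(512) = 9, which is the VT-d encoding for a 2MiB range.  A
 * single 4KiB granule gives nr_pages = 1 and size order 0.
 */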
5525
5526 static int
5527 intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5528                            struct iommu_cache_invalidate_info *inv_info)
5529 {
5530         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5531         struct device_domain_info *info;
5532         struct intel_iommu *iommu;
5533         unsigned long flags;
5534         int cache_type;
5535         u8 bus, devfn;
5536         u16 did, sid;
5537         int ret = 0;
5538         u64 size = 0;
5539
5540         if (!inv_info || !dmar_domain)
5541                 return -EINVAL;
5542
5543         if (!dev || !dev_is_pci(dev))
5544                 return -ENODEV;
5545
5546         iommu = device_to_iommu(dev, &bus, &devfn);
5547         if (!iommu)
5548                 return -ENODEV;
5549
5550         if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5551                 return -EINVAL;
5552
5553         spin_lock_irqsave(&device_domain_lock, flags);
5554         spin_lock(&iommu->lock);
5555         info = get_domain_info(dev);
5556         if (!info) {
5557                 ret = -EINVAL;
5558                 goto out_unlock;
5559         }
5560         did = dmar_domain->iommu_did[iommu->seq_id];
5561         sid = PCI_DEVID(bus, devfn);
5562
5563         /* Size is only valid in address selective invalidation */
5564         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5565                 size = to_vtd_size(inv_info->granu.addr_info.granule_size,
5566                                    inv_info->granu.addr_info.nb_granules);
5567
5568         for_each_set_bit(cache_type,
5569                          (unsigned long *)&inv_info->cache,
5570                          IOMMU_CACHE_INV_TYPE_NR) {
5571                 int granu = 0;
5572                 u64 pasid = 0;
5573                 u64 addr = 0;
5574
5575                 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5576                 if (granu == -EINVAL) {
5577                         pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5578                                            cache_type, inv_info->granularity);
5579                         break;
5580                 }
5581
5582                 /*
5583                  * PASID is stored in different locations based on the
5584                  * granularity.
5585                  */
5586                 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5587                     (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5588                         pasid = inv_info->granu.pasid_info.pasid;
5589                 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5590                          (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5591                         pasid = inv_info->granu.addr_info.pasid;
5592
5593                 switch (BIT(cache_type)) {
5594                 case IOMMU_CACHE_INV_TYPE_IOTLB:
5595                         /* HW will ignore LSB bits based on address mask */
5596                         if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5597                             size &&
5598                             (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5599                                 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5600                                                    inv_info->granu.addr_info.addr, size);
5601                         }
5602
5603                         /*
5604                          * If granu is PASID-selective, address is ignored.
5605                          * We use npages = -1 to indicate that.
5606                          */
5607                         qi_flush_piotlb(iommu, did, pasid,
5608                                         mm_to_dma_pfn(inv_info->granu.addr_info.addr),
5609                                         (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5610                                         inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5611
5612                         if (!info->ats_enabled)
5613                                 break;
5614                         /*
5615                          * Always flush device IOTLB if ATS is enabled. vIOMMU
5616                          * in the guest may assume IOTLB flush is inclusive,
5617                          * which is more efficient.
5618                          */
5619                         fallthrough;
5620                 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5621                         /*
5622                          * PASID-based device TLB invalidation does not support
5623                          * IOMMU_INV_GRANU_PASID granularity; it only supports
5624                          * IOMMU_INV_GRANU_ADDR.
5625                          * The equivalent is to make the size cover the entire
5626                          * 64-bit address range. The user provides only PASID
5627                          * info without address info, so we set addr to 0.
5628                          */
5629                         if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5630                                 size = 64 - VTD_PAGE_SHIFT;
5631                                 addr = 0;
5632                         } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5633                                 addr = inv_info->granu.addr_info.addr;
5634                         }
5635
5636                         if (info->ats_enabled)
5637                                 qi_flush_dev_iotlb_pasid(iommu, sid,
5638                                                 info->pfsid, pasid,
5639                                                 info->ats_qdep, addr,
5640                                                 size);
5641                         else
5642                                 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5643                         break;
5644                 default:
5645                         dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5646                                             cache_type);
5647                         ret = -EINVAL;
5648                 }
5649         }
5650 out_unlock:
5651         spin_unlock(&iommu->lock);
5652         spin_unlock_irqrestore(&device_domain_lock, flags);
5653
5654         return ret;
5655 }
5656 #endif
5657
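/*
 * iommu_ops->map callback: translate IOMMU_READ/WRITE (and IOMMU_CACHE,
 * when the domain's hardware supports snoop control) into VT-d PTE bits,
 * check that the end of the mapping still fits within the domain's guest
 * address width while updating max_addr, then round the size up to whole
 * VT-d pages and install the mapping via domain_pfn_mapping().
 */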
5658 static int intel_iommu_map(struct iommu_domain *domain,
5659                            unsigned long iova, phys_addr_t hpa,
5660                            size_t size, int iommu_prot, gfp_t gfp)
5661 {
5662         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5663         u64 max_addr;
5664         int prot = 0;
5665         int ret;
5666
5667         if (iommu_prot & IOMMU_READ)
5668                 prot |= DMA_PTE_READ;
5669         if (iommu_prot & IOMMU_WRITE)
5670                 prot |= DMA_PTE_WRITE;
5671         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5672                 prot |= DMA_PTE_SNP;
5673
5674         max_addr = iova + size;
5675         if (dmar_domain->max_addr < max_addr) {
5676                 u64 end;
5677
5678                 /* check if minimum agaw is sufficient for mapped address */
5679                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5680                 if (end < max_addr) {
5681                         pr_err("%s: iommu width (%d) is not "
5682                                "sufficient for the mapped address (%llx)\n",
5683                                __func__, dmar_domain->gaw, max_addr);
5684                         return -EFAULT;
5685                 }
5686                 dmar_domain->max_addr = max_addr;
5687         }
5688         /* Round up size to next multiple of PAGE_SIZE, if it and
5689            the low bits of hpa would take us onto the next page */
5690         size = aligned_nrpages(hpa, size);
5691         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5692                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5693         return ret;
5694 }
5695
5696 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5697                                 unsigned long iova, size_t size,
5698                                 struct iommu_iotlb_gather *gather)
5699 {
5700         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5701         struct page *freelist = NULL;
5702         unsigned long start_pfn, last_pfn;
5703         unsigned int npages;
5704         int iommu_id, level = 0;
5705
5706         /* Cope with horrid API which requires us to unmap more than the
5707            size argument if it happens to be a large-page mapping. */
5708         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5709
5710         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5711                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5712
5713         start_pfn = iova >> VTD_PAGE_SHIFT;
5714         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5715
5716         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5717
5718         npages = last_pfn - start_pfn + 1;
5719
5720         for_each_domain_iommu(iommu_id, dmar_domain)
5721                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5722                                       start_pfn, npages, !freelist, 0);
5723
5724         dma_free_pagelist(freelist);
5725
5726         if (dmar_domain->max_addr == iova + size)
5727                 dmar_domain->max_addr = iova;
5728
5729         return size;
5730 }
5731
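/*
 * iommu_ops->iova_to_phys callback: walk the domain's page table to the
 * PTE backing @iova, which may sit at a higher level for super-page
 * mappings, and add back the low-order bits of @iova that the PTE does
 * not translate.  Unmapped addresses yield 0.
 */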
5732 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5733                                             dma_addr_t iova)
5734 {
5735         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5736         struct dma_pte *pte;
5737         int level = 0;
5738         u64 phys = 0;
5739
5740         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5741         if (pte && dma_pte_present(pte))
5742                 phys = dma_pte_addr(pte) +
5743                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5744                                                 VTD_PAGE_SHIFT) - 1));
5745
5746         return phys;
5747 }
5748
5749 static inline bool scalable_mode_support(void)
5750 {
5751         struct dmar_drhd_unit *drhd;
5752         struct intel_iommu *iommu;
5753         bool ret = true;
5754
5755         rcu_read_lock();
5756         for_each_active_iommu(iommu, drhd) {
5757                 if (!sm_supported(iommu)) {
5758                         ret = false;
5759                         break;
5760                 }
5761         }
5762         rcu_read_unlock();
5763
5764         return ret;
5765 }
5766
5767 static inline bool iommu_pasid_support(void)
5768 {
5769         struct dmar_drhd_unit *drhd;
5770         struct intel_iommu *iommu;
5771         bool ret = true;
5772
5773         rcu_read_lock();
5774         for_each_active_iommu(iommu, drhd) {
5775                 if (!pasid_supported(iommu)) {
5776                         ret = false;
5777                         break;
5778                 }
5779         }
5780         rcu_read_unlock();
5781
5782         return ret;
5783 }
5784
5785 static inline bool nested_mode_support(void)
5786 {
5787         struct dmar_drhd_unit *drhd;
5788         struct intel_iommu *iommu;
5789         bool ret = true;
5790
5791         rcu_read_lock();
5792         for_each_active_iommu(iommu, drhd) {
5793                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5794                         ret = false;
5795                         break;
5796                 }
5797         }
5798         rcu_read_unlock();
5799
5800         return ret;
5801 }
5802
5803 static bool intel_iommu_capable(enum iommu_cap cap)
5804 {
5805         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5806                 return domain_update_iommu_snooping(NULL) == 1;
5807         if (cap == IOMMU_CAP_INTR_REMAP)
5808                 return irq_remapping_enabled == 1;
5809
5810         return false;
5811 }
5812
5813 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5814 {
5815         struct intel_iommu *iommu;
5816
5817         iommu = device_to_iommu(dev, NULL, NULL);
5818         if (!iommu)
5819                 return ERR_PTR(-ENODEV);
5820
5821         if (translation_pre_enabled(iommu))
5822                 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5823
5824         return &iommu->iommu;
5825 }
5826
5827 static void intel_iommu_release_device(struct device *dev)
5828 {
5829         struct intel_iommu *iommu;
5830
5831         iommu = device_to_iommu(dev, NULL, NULL);
5832         if (!iommu)
5833                 return;
5834
5835         dmar_remove_one_dev_info(dev);
5836
5837         set_dma_ops(dev, NULL);
5838 }
5839
5840 static void intel_iommu_probe_finalize(struct device *dev)
5841 {
5842         struct iommu_domain *domain;
5843
5844         domain = iommu_get_domain_for_dev(dev);
5845         if (device_needs_bounce(dev))
5846                 set_dma_ops(dev, &bounce_dma_ops);
5847         else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5848                 set_dma_ops(dev, &intel_dma_ops);
5849         else
5850                 set_dma_ops(dev, NULL);
5851 }
5852
5853 static void intel_iommu_get_resv_regions(struct device *device,
5854                                          struct list_head *head)
5855 {
5856         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5857         struct iommu_resv_region *reg;
5858         struct dmar_rmrr_unit *rmrr;
5859         struct device *i_dev;
5860         int i;
5861
5862         down_read(&dmar_global_lock);
5863         for_each_rmrr_units(rmrr) {
5864                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5865                                           i, i_dev) {
5866                         struct iommu_resv_region *resv;
5867                         enum iommu_resv_type type;
5868                         size_t length;
5869
5870                         if (i_dev != device &&
5871                             !is_downstream_to_pci_bridge(device, i_dev))
5872                                 continue;
5873
5874                         length = rmrr->end_address - rmrr->base_address + 1;
5875
5876                         type = device_rmrr_is_relaxable(device) ?
5877                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5878
5879                         resv = iommu_alloc_resv_region(rmrr->base_address,
5880                                                        length, prot, type);
5881                         if (!resv)
5882                                 break;
5883
5884                         list_add_tail(&resv->list, head);
5885                 }
5886         }
5887         up_read(&dmar_global_lock);
5888
5889 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5890         if (dev_is_pci(device)) {
5891                 struct pci_dev *pdev = to_pci_dev(device);
5892
5893                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5894                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5895                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5896                         if (reg)
5897                                 list_add_tail(&reg->list, head);
5898                 }
5899         }
5900 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5901
5902         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5903                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5904                                       0, IOMMU_RESV_MSI);
5905         if (!reg)
5906                 return;
5907         list_add_tail(&reg->list, head);
5908 }
5909
5910 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5911 {
5912         struct device_domain_info *info;
5913         struct context_entry *context;
5914         struct dmar_domain *domain;
5915         unsigned long flags;
5916         u64 ctx_lo;
5917         int ret;
5918
5919         domain = find_domain(dev);
5920         if (!domain)
5921                 return -EINVAL;
5922
5923         spin_lock_irqsave(&device_domain_lock, flags);
5924         spin_lock(&iommu->lock);
5925
5926         ret = -EINVAL;
5927         info = get_domain_info(dev);
5928         if (!info || !info->pasid_supported)
5929                 goto out;
5930
5931         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5932         if (WARN_ON(!context))
5933                 goto out;
5934
5935         ctx_lo = context[0].lo;
5936
5937         if (!(ctx_lo & CONTEXT_PASIDE)) {
5938                 ctx_lo |= CONTEXT_PASIDE;
5939                 context[0].lo = ctx_lo;
5940                 wmb();
5941                 iommu->flush.flush_context(iommu,
5942                                            domain->iommu_did[iommu->seq_id],
5943                                            PCI_DEVID(info->bus, info->devfn),
5944                                            DMA_CCMD_MASK_NOBIT,
5945                                            DMA_CCMD_DEVICE_INVL);
5946         }
5947
5948         /* Enable PASID support in the device, if it wasn't already */
5949         if (!info->pasid_enabled)
5950                 iommu_enable_dev_iotlb(info);
5951
5952         ret = 0;
5953
5954  out:
5955         spin_unlock(&iommu->lock);
5956         spin_unlock_irqrestore(&device_domain_lock, flags);
5957
5958         return ret;
5959 }
5960
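/*
 * Reserve the IOVA range covering a reserved region so that the domain's
 * IOVA allocator can never hand out addresses overlapping it.
 */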
5961 static void intel_iommu_apply_resv_region(struct device *dev,
5962                                           struct iommu_domain *domain,
5963                                           struct iommu_resv_region *region)
5964 {
5965         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5966         unsigned long start, end;
5967
5968         start = IOVA_PFN(region->start);
5969         end   = IOVA_PFN(region->start + region->length - 1);
5970
5971         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5972 }
5973
5974 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5975 {
5976         if (dev_is_pci(dev))
5977                 return pci_device_group(dev);
5978         return generic_device_group(dev);
5979 }
5980
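/*
 * Enable auxiliary-domain (aux-domain) support for a device.  This requires
 * an IOMMU in scalable mode with PASID support; PASID is then enabled for the
 * device and auxd_enabled is set, so that aux domains, each identified by its
 * own default PASID, can be attached later through the aux_attach_dev path.
 */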
5981 static int intel_iommu_enable_auxd(struct device *dev)
5982 {
5983         struct device_domain_info *info;
5984         struct intel_iommu *iommu;
5985         unsigned long flags;
5986         int ret;
5987
5988         iommu = device_to_iommu(dev, NULL, NULL);
5989         if (!iommu || dmar_disabled)
5990                 return -EINVAL;
5991
5992         if (!sm_supported(iommu) || !pasid_supported(iommu))
5993                 return -EINVAL;
5994
5995         ret = intel_iommu_enable_pasid(iommu, dev);
5996         if (ret)
5997                 return -ENODEV;
5998
5999         spin_lock_irqsave(&device_domain_lock, flags);
6000         info = get_domain_info(dev);
6001         info->auxd_enabled = 1;
6002         spin_unlock_irqrestore(&device_domain_lock, flags);
6003
6004         return 0;
6005 }
6006
6007 static int intel_iommu_disable_auxd(struct device *dev)
6008 {
6009         struct device_domain_info *info;
6010         unsigned long flags;
6011
6012         spin_lock_irqsave(&device_domain_lock, flags);
6013         info = get_domain_info(dev);
6014         if (!WARN_ON(!info))
6015                 info->auxd_enabled = 0;
6016         spin_unlock_irqrestore(&device_domain_lock, flags);
6017
6018         return 0;
6019 }
6020
6021 /*
6022  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6023  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6024  * specification so that system software and tools can detect endpoint
6025  * devices supporting Intel Scalable I/O Virtualization without depending on
6026  * a host driver.
6027  *
6028  * Returns the configuration space offset of the matching extended capability
6029  * structure, or 0 if the device does not support it.
6030  */
6031 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6032 {
6033         int pos;
6034         u16 vendor, id;
6035
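        /*
         * Walk the Designated Vendor-Specific Extended Capabilities (extended
         * capability ID 0x23).  In each instance, the word at offset 4 holds
         * the DVSEC vendor ID and the word at offset 8 the DVSEC ID; the
         * Scalable IOV DVSEC is the Intel-vendored instance with DVSEC ID 5.
         */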
6036         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6037         while (pos) {
6038                 pci_read_config_word(pdev, pos + 4, &vendor);
6039                 pci_read_config_word(pdev, pos + 8, &id);
6040                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6041                         return pos;
6042
6043                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6044         }
6045
6046         return 0;
6047 }
6048
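/*
 * Feature detection for the IOMMU core.  IOMMU_DEV_FEAT_AUX requires scalable
 * mode, PASID support, the PCI PASID capability and the Scalable IOV DVSEC;
 * IOMMU_DEV_FEAT_SVA additionally requires an SVM-capable IOMMU plus PASID,
 * PRI and ATS on the device.  A consumer would typically probe and enable
 * these through the generic wrappers, roughly as in the sketch below
 * (assuming the iommu_dev_has_feature()/iommu_dev_enable_feature() core API
 * and an already allocated aux domain):
 *
 *	if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) &&
 *	    !iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX) &&
 *	    !iommu_aux_attach_device(domain, dev))
 *		pasid = iommu_aux_get_pasid(domain, dev);
 */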
6049 static bool
6050 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6051 {
6052         if (feat == IOMMU_DEV_FEAT_AUX) {
6053                 int ret;
6054
6055                 if (!dev_is_pci(dev) || dmar_disabled ||
6056                     !scalable_mode_support() || !iommu_pasid_support())
6057                         return false;
6058
6059                 ret = pci_pasid_features(to_pci_dev(dev));
6060                 if (ret < 0)
6061                         return false;
6062
6063                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6064         }
6065
6066         if (feat == IOMMU_DEV_FEAT_SVA) {
6067                 struct device_domain_info *info = get_domain_info(dev);
6068
6069                 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
6070                         info->pasid_supported && info->pri_supported &&
6071                         info->ats_supported;
6072         }
6073
6074         return false;
6075 }
6076
6077 static int
6078 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6079 {
6080         if (feat == IOMMU_DEV_FEAT_AUX)
6081                 return intel_iommu_enable_auxd(dev);
6082
6083         if (feat == IOMMU_DEV_FEAT_SVA) {
6084                 struct device_domain_info *info = get_domain_info(dev);
6085
6086                 if (!info)
6087                         return -EINVAL;
6088
6089                 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
6090                         return 0;
6091         }
6092
6093         return -ENODEV;
6094 }
6095
6096 static int
6097 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6098 {
6099         if (feat == IOMMU_DEV_FEAT_AUX)
6100                 return intel_iommu_disable_auxd(dev);
6101
6102         return -ENODEV;
6103 }
6104
6105 static bool
6106 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6107 {
6108         struct device_domain_info *info = get_domain_info(dev);
6109
6110         if (feat == IOMMU_DEV_FEAT_AUX)
6111                 return scalable_mode_support() && info && info->auxd_enabled;
6112
6113         return false;
6114 }
6115
6116 static int
6117 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6118 {
6119         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6120
6121         return dmar_domain->default_pasid > 0 ?
6122                         dmar_domain->default_pasid : -EINVAL;
6123 }
6124
6125 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6126                                            struct device *dev)
6127 {
6128         return attach_deferred(dev);
6129 }
6130
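/*
 * Only DOMAIN_ATTR_NESTING is handled: nested translation (a host second
 * level underneath a guest-managed first level) can only be switched on for
 * an unmanaged domain with no devices attached yet, and it is mutually
 * exclusive with the first-level-only mapping mode.
 */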
6131 static int
6132 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6133                             enum iommu_attr attr, void *data)
6134 {
6135         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6136         unsigned long flags;
6137         int ret = 0;
6138
6139         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6140                 return -EINVAL;
6141
6142         switch (attr) {
6143         case DOMAIN_ATTR_NESTING:
6144                 spin_lock_irqsave(&device_domain_lock, flags);
6145                 if (nested_mode_support() &&
6146                     list_empty(&dmar_domain->devices)) {
6147                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6148                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6149                 } else {
6150                         ret = -ENODEV;
6151                 }
6152                 spin_unlock_irqrestore(&device_domain_lock, flags);
6153                 break;
6154         default:
6155                 ret = -EINVAL;
6156                 break;
6157         }
6158
6159         return ret;
6160 }
6161
6162 /*
6163  * Check that the device does not live on an external-facing PCI port that is
6164  * marked as untrusted. Such devices should not be allowed to apply quirks,
6165  * and thus should not be able to bypass the IOMMU restrictions.
6166  */
6167 static bool risky_device(struct pci_dev *pdev)
6168 {
6169         if (pdev->untrusted) {
6170                 pci_info(pdev,
6171                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6172                          pdev->vendor, pdev->device);
6173                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6174                 return true;
6175         }
6176         return false;
6177 }
6178
6179 const struct iommu_ops intel_iommu_ops = {
6180         .capable                = intel_iommu_capable,
6181         .domain_alloc           = intel_iommu_domain_alloc,
6182         .domain_free            = intel_iommu_domain_free,
6183         .domain_set_attr        = intel_iommu_domain_set_attr,
6184         .attach_dev             = intel_iommu_attach_device,
6185         .detach_dev             = intel_iommu_detach_device,
6186         .aux_attach_dev         = intel_iommu_aux_attach_device,
6187         .aux_detach_dev         = intel_iommu_aux_detach_device,
6188         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6189         .map                    = intel_iommu_map,
6190         .unmap                  = intel_iommu_unmap,
6191         .iova_to_phys           = intel_iommu_iova_to_phys,
6192         .probe_device           = intel_iommu_probe_device,
6193         .probe_finalize         = intel_iommu_probe_finalize,
6194         .release_device         = intel_iommu_release_device,
6195         .get_resv_regions       = intel_iommu_get_resv_regions,
6196         .put_resv_regions       = generic_iommu_put_resv_regions,
6197         .apply_resv_region      = intel_iommu_apply_resv_region,
6198         .device_group           = intel_iommu_device_group,
6199         .dev_has_feat           = intel_iommu_dev_has_feat,
6200         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6201         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6202         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6203         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6204         .def_domain_type        = device_def_domain_type,
6205         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6206 #ifdef CONFIG_INTEL_IOMMU_SVM
6207         .cache_invalidate       = intel_iommu_sva_invalidate,
6208         .sva_bind_gpasid        = intel_svm_bind_gpasid,
6209         .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
6210         .sva_bind               = intel_svm_bind,
6211         .sva_unbind             = intel_svm_unbind,
6212         .sva_get_pasid          = intel_svm_get_pasid,
6213         .page_response          = intel_svm_page_response,
6214 #endif
6215 };
6216
6217 static void quirk_iommu_igfx(struct pci_dev *dev)
6218 {
6219         if (risky_device(dev))
6220                 return;
6221
6222         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6223         dmar_map_gfx = 0;
6224 }
6225
6226 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6234
6235 /* Broadwell igfx malfunctions with dmar */
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6258 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6259 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6260
6261 static void quirk_iommu_rwbf(struct pci_dev *dev)
6262 {
6263         if (risky_device(dev))
6264                 return;
6265
6266         /*
6267          * Mobile 4 Series Chipset neglects to set RWBF capability,
6268          * but needs it. Same seems to hold for the desktop versions.
6269          */
6270         pci_info(dev, "Forcing write-buffer flush capability\n");
6271         rwbf_quirk = 1;
6272 }
6273
6274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6279 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6280 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6281
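/*
 * GGC is the GMCH graphics control register, read at config offset 0x52 of
 * the Calpella/Ironlake-era devices this quirk is attached to.  Its memory
 * size field encodes how much stolen memory the BIOS reserved for the
 * graphics GTT; the *_VT variants indicate that room for a VT-d shadow GTT
 * was also set aside, which is what quirk_calpella_no_shadow_gtt() checks.
 */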
6282 #define GGC 0x52
6283 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6284 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6285 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6286 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6287 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6288 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6289 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6290 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6291
6292 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6293 {
6294         unsigned short ggc;
6295
6296         if (risky_device(dev))
6297                 return;
6298
6299         if (pci_read_config_word(dev, GGC, &ggc))
6300                 return;
6301
6302         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6303                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6304                 dmar_map_gfx = 0;
6305         } else if (dmar_map_gfx) {
6306                 /* we have to ensure the gfx device is idle before we flush */
6307                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6308                 intel_iommu_strict = 1;
6309         }
6310 }
6311 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6312 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6313 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6314 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6315
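/*
 * Match integrated graphics by the high byte of the PCI device ID and flag
 * the platform so that translation is left enabled on the graphics IOMMU
 * where the driver would otherwise clear the translation-enable (TE) bit;
 * iommu_skip_te_disable is checked for that purpose when translation is
 * being disabled.
 */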
6316 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6317 {
6318         unsigned short ver;
6319
6320         if (!IS_GFX_DEVICE(dev))
6321                 return;
6322
6323         ver = (dev->device >> 8) & 0xff;
6324         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6325             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6326             ver != 0x9a && ver != 0xa7)
6327                 return;
6328
6329         if (risky_device(dev))
6330                 return;
6331
6332         pci_info(dev, "Skip IOMMU disabling for graphics\n");
6333         iommu_skip_te_disable = 1;
6334 }
6335 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6336
6337 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6338    ISOCH DMAR unit for the Azalia sound device, but not give it any
6339    TLB entries, which causes it to deadlock. Check for that.  We do
6340    this in a function called from init_dmars(), instead of in a PCI
6341    quirk, because we don't want to print the obnoxious "BIOS broken"
6342    message if VT-d is actually disabled.
6343 */
6344 static void __init check_tylersburg_isoch(void)
6345 {
6346         struct pci_dev *pdev;
6347         uint32_t vtisochctrl;
6348
6349         /* If there's no Azalia in the system anyway, forget it. */
6350         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6351         if (!pdev)
6352                 return;
6353
6354         if (risky_device(pdev)) {
6355                 pci_dev_put(pdev);
6356                 return;
6357         }
6358
6359         pci_dev_put(pdev);
6360
6361         /* System Management Registers. Might be hidden, in which case
6362            we can't do the sanity check. But that's OK, because the
6363            known-broken BIOSes _don't_ actually hide it, so far. */
6364         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6365         if (!pdev)
6366                 return;
6367
6368         if (risky_device(pdev)) {
6369                 pci_dev_put(pdev);
6370                 return;
6371         }
6372
6373         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6374                 pci_dev_put(pdev);
6375                 return;
6376         }
6377
6378         pci_dev_put(pdev);
6379
6380         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6381         if (vtisochctrl & 1)
6382                 return;
6383
6384         /* Drop all bits other than the number of TLB entries */
6385         vtisochctrl &= 0x1c;
6386
6387         /* If we have the recommended number of TLB entries (16), fine. */
6388         if (vtisochctrl == 0x10)
6389                 return;
6390
6391         /* Zero TLB entries? Then the BIOS is unquestionably broken. */
6392         if (!vtisochctrl) {
6393                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6394                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6395                      dmi_get_system_info(DMI_BIOS_VENDOR),
6396                      dmi_get_system_info(DMI_BIOS_VERSION),
6397                      dmi_get_system_info(DMI_PRODUCT_VERSION));
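                /*
                 * Give the Azalia device a passthrough (identity) mapping so
                 * that audio DMA does not depend on the isoch DMAR unit's
                 * nonexistent TLB entries.
                 */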
6398                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6399                 return;
6400         }
6401
6402         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6403                vtisochctrl);
6404 }