GNU Linux-libre 5.4.241-gnu1
drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
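/*
 * Worked example (illustrative): with gaw = 48 and VTD_PAGE_SHIFT = 12,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1.  On a 64-bit build this fits in an
 * unsigned long unchanged; on a 32-bit build DOMAIN_MAX_PFN() clamps it to
 * ULONG_MAX via min_t(), which is what lets the rest of the driver treat
 * PFNs as 'unsigned long' safely.
 */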
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is an order of a 4KiB page and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are an order of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
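/*
 * Worked example (illustrative): in the IOMMU core's pgsize_bitmap
 * convention, a set bit n advertises support for page size 2^n bytes.
 * ~0xFFFUL clears bits 0-11 and sets every bit from 12 upward, so the core
 * hands this driver naturally aligned, power-of-two-sized chunks of at
 * least 4KiB instead of splitting requests down to one fixed page size.
 */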
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(u64 pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline u64 level_mask(int level)
132 {
133         return -1ULL << level_to_offset_bits(level);
134 }
135
136 static inline u64 level_size(int level)
137 {
138         return 1ULL << level_to_offset_bits(level);
139 }
140
141 static inline u64 align_to_level(u64 pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
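/*
 * Worked example (illustrative): a 48-bit address width gives
 * width_to_agaw(48) = 2 and agaw_to_level(2) = 4, i.e. a 4-level page
 * table.  At level 2, level_to_offset_bits(2) = 9, so level_size(2) = 512
 * and a single level-2 entry spans 512 * 4KiB = 2MiB of IOVA space.
 */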
150
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
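/*
 * Illustrative note: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so
 * the shifts above are by zero and the mm<->dma PFN conversions are
 * identities; they only do real work when the CPU page size is larger than
 * the 4KiB VT-d page size.
 */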
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 static int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
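/*
 * Bit usage implied by the helpers above (descriptive summary):
 *   lo  bit  0      present
 *   lo  bit  1      fault processing disable (cleared by set_fault_enable)
 *   lo  bits 2-3    translation type
 *   lo  bit  11     PASID enable
 *   lo  bits 12-63  address root
 *   hi  bits 0-2    address width (AGAW)
 *   hi  bit  3      "copied" software marker
 *   hi  bits 8-23   domain id
 */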
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 #define for_each_domain_iommu(idx, domain)                      \
311         for (idx = 0; idx < g_num_of_iommus; idx++)             \
312                 if (domain->iommu_refcnt[idx])
313
314 struct dmar_rmrr_unit {
315         struct list_head list;          /* list of rmrr units   */
316         struct acpi_dmar_header *hdr;   /* ACPI header          */
317         u64     base_address;           /* reserved base address*/
318         u64     end_address;            /* reserved end address */
319         struct dmar_dev_scope *devices; /* target devices */
320         int     devices_cnt;            /* target device count */
321 };
322
323 struct dmar_atsr_unit {
324         struct list_head list;          /* list of ATSR units */
325         struct acpi_dmar_header *hdr;   /* ACPI header */
326         struct dmar_dev_scope *devices; /* target devices */
327         int devices_cnt;                /* target device count */
328         u8 include_all:1;               /* include all ports */
329 };
330
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
333
334 #define for_each_rmrr_units(rmrr) \
335         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337 /* number of IOMMUs, used to size and index g_iommus */
338 static int g_num_of_iommus;
339
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345                                  struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347                                struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350                                      struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352                                             dma_addr_t iova);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
370
371 #define IDENTMAP_ALL            1
372 #define IDENTMAP_GFX            2
373 #define IDENTMAP_AZALIA         4
374
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
377
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
384                                 to_pci_dev(d)->untrusted)
385
386 /*
387  * Iterate over elements in device_domain_list and call the specified
388  * callback @fn against each element.
389  */
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391                                      void *data), void *data)
392 {
393         int ret = 0;
394         unsigned long flags;
395         struct device_domain_info *info;
396
397         spin_lock_irqsave(&device_domain_lock, flags);
398         list_for_each_entry(info, &device_domain_list, global) {
399                 ret = fn(info, data);
400                 if (ret) {
401                         spin_unlock_irqrestore(&device_domain_lock, flags);
402                         return ret;
403                 }
404         }
405         spin_unlock_irqrestore(&device_domain_lock, flags);
406
407         return 0;
408 }
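/*
 * Usage sketch (illustrative; count_one is hypothetical): the callback runs
 * under device_domain_lock and a non-zero return value stops the walk.
 *
 *     static int count_one(struct device_domain_info *info, void *data)
 *     {
 *             (*(int *)data)++;
 *             return 0;
 *     }
 *
 *     int n = 0;
 *     for_each_device_domain(count_one, &n);
 */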
409
410 const struct iommu_ops intel_iommu_ops;
411
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 {
414         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 }
416
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 }
421
422 static void init_translation_status(struct intel_iommu *iommu)
423 {
424         u32 gsts;
425
426         gsts = readl(iommu->reg + DMAR_GSTS_REG);
427         if (gsts & DMA_GSTS_TES)
428                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 }
430
431 /* Convert a generic 'struct iommu_domain' to a private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 {
434         return container_of(dom, struct dmar_domain, domain);
435 }
436
437 static int __init intel_iommu_setup(char *str)
438 {
439         if (!str)
440                 return -EINVAL;
441         while (*str) {
442                 if (!strncmp(str, "on", 2)) {
443                         dmar_disabled = 0;
444                         pr_info("IOMMU enabled\n");
445                 } else if (!strncmp(str, "off", 3)) {
446                         dmar_disabled = 1;
447                         no_platform_optin = 1;
448                         pr_info("IOMMU disabled\n");
449                 } else if (!strncmp(str, "igfx_off", 8)) {
450                         dmar_map_gfx = 0;
451                         pr_info("Disable GFX device mapping\n");
452                 } else if (!strncmp(str, "forcedac", 8)) {
453                         pr_info("Forcing DAC for PCI devices\n");
454                         dmar_forcedac = 1;
455                 } else if (!strncmp(str, "strict", 6)) {
456                         pr_info("Disable batched IOTLB flush\n");
457                         intel_iommu_strict = 1;
458                 } else if (!strncmp(str, "sp_off", 6)) {
459                         pr_info("Disable supported super page\n");
460                         intel_iommu_superpage = 0;
461                 } else if (!strncmp(str, "sm_on", 5)) {
462                         pr_info("Intel-IOMMU: scalable mode supported\n");
463                         intel_iommu_sm = 1;
464                 } else if (!strncmp(str, "tboot_noforce", 13)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467                         intel_iommu_tboot_noforce = 1;
468                 } else if (!strncmp(str, "nobounce", 8)) {
469                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470                         intel_no_bounce = 1;
471                 }
472
473                 str += strcspn(str, ",");
474                 while (*str == ',')
475                         str++;
476         }
477         return 0;
478 }
479 __setup("intel_iommu=", intel_iommu_setup);
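/*
 * Example (illustrative): the parser above takes a comma-separated option
 * list on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, enables scalable mode and disables batched
 * IOTLB flushing.
 */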
480
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
483
484 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         domains = iommu->domains[idx];
490         if (!domains)
491                 return NULL;
492
493         return domains[did & 0xff];
494 }
495
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497                              struct dmar_domain *domain)
498 {
499         struct dmar_domain **domains;
500         int idx = did >> 8;
501
502         if (!iommu->domains[idx]) {
503                 size_t size = 256 * sizeof(struct dmar_domain *);
504                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
505         }
506
507         domains = iommu->domains[idx];
508         if (WARN_ON(!domains))
509                 return;
510         else
511                 domains[did & 0xff] = domain;
512 }
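/*
 * Worked example (illustrative): domain IDs index a two-level table; the
 * upper 8 bits select a lazily allocated 256-entry array and the lower
 * 8 bits select the slot, so did 0x1234 resolves to
 * iommu->domains[0x12][0x34].
 */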
513
514 void *alloc_pgtable_page(int node)
515 {
516         struct page *page;
517         void *vaddr = NULL;
518
519         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520         if (page)
521                 vaddr = page_address(page);
522         return vaddr;
523 }
524
525 void free_pgtable_page(void *vaddr)
526 {
527         free_page((unsigned long)vaddr);
528 }
529
530 static inline void *alloc_domain_mem(void)
531 {
532         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
533 }
534
535 static void free_domain_mem(void *vaddr)
536 {
537         kmem_cache_free(iommu_domain_cache, vaddr);
538 }
539
540 static inline void *alloc_devinfo_mem(void)
541 {
542         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
543 }
544
545 static inline void free_devinfo_mem(void *vaddr)
546 {
547         kmem_cache_free(iommu_devinfo_cache, vaddr);
548 }
549
550 static inline int domain_type_is_si(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * Calculate agaw for each iommu.
588  * "SAGAW" may be different across iommus; use a default agaw and
589  * fall back to a smaller supported agaw for iommus that don't support the default.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615 {
616         return sm_supported(iommu) ?
617                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618 }
619
620 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu;
624         bool found = false;
625         int i;
626
627         domain->iommu_coherency = 1;
628
629         for_each_domain_iommu(i, domain) {
630                 found = true;
631                 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632                         domain->iommu_coherency = 0;
633                         break;
634                 }
635         }
636         if (found)
637                 return;
638
639         /* No hardware attached; use lowest common denominator */
640         rcu_read_lock();
641         for_each_active_iommu(iommu, drhd) {
642                 if (!iommu_paging_structure_coherency(iommu)) {
643                         domain->iommu_coherency = 0;
644                         break;
645                 }
646         }
647         rcu_read_unlock();
648 }
649
650 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 {
652         struct dmar_drhd_unit *drhd;
653         struct intel_iommu *iommu;
654         int ret = 1;
655
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         if (!ecap_sc_support(iommu->ecap)) {
660                                 ret = 0;
661                                 break;
662                         }
663                 }
664         }
665         rcu_read_unlock();
666
667         return ret;
668 }
669
670 static int domain_update_iommu_superpage(struct intel_iommu *skip)
671 {
672         struct dmar_drhd_unit *drhd;
673         struct intel_iommu *iommu;
674         int mask = 0xf;
675
676         if (!intel_iommu_superpage) {
677                 return 0;
678         }
679
680         /* set iommu_superpage to the smallest common denominator */
681         rcu_read_lock();
682         for_each_active_iommu(iommu, drhd) {
683                 if (iommu != skip) {
684                         mask &= cap_super_page_val(iommu->cap);
685                         if (!mask)
686                                 break;
687                 }
688         }
689         rcu_read_unlock();
690
691         return fls(mask);
692 }
693
694 /* Some capabilities may be different across iommus */
695 static void domain_update_iommu_cap(struct dmar_domain *domain)
696 {
697         domain_update_iommu_coherency(domain);
698         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
699         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
700 }
701
702 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
703                                          u8 devfn, int alloc)
704 {
705         struct root_entry *root = &iommu->root_entry[bus];
706         struct context_entry *context;
707         u64 *entry;
708
709         entry = &root->lo;
710         if (sm_supported(iommu)) {
711                 if (devfn >= 0x80) {
712                         devfn -= 0x80;
713                         entry = &root->hi;
714                 }
715                 devfn *= 2;
716         }
717         if (*entry & 1)
718                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719         else {
720                 unsigned long phy_addr;
721                 if (!alloc)
722                         return NULL;
723
724                 context = alloc_pgtable_page(iommu->node);
725                 if (!context)
726                         return NULL;
727
728                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
729                 phy_addr = virt_to_phys((void *)context);
730                 *entry = phy_addr | 1;
731                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
732         }
733         return &context[devfn];
734 }
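/*
 * Worked example (illustrative): in scalable mode each root entry is split
 * in two, root->lo covering devfn 0x00-0x7f and root->hi covering devfn
 * 0x80-0xff, with every device taking a pair of context-entry slots.  For
 * devfn 0x85 the code above therefore uses root->hi and slot
 * (0x85 - 0x80) * 2 = 10 of that context table.
 */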
735
736 static int iommu_dummy(struct device *dev)
737 {
738         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
739 }
740
741 /**
742  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
743  *                               sub-hierarchy of a candidate PCI-PCI bridge
744  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
745  * @bridge: the candidate PCI-PCI bridge
746  *
747  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
748  */
749 static bool
750 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
751 {
752         struct pci_dev *pdev, *pbridge;
753
754         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
755                 return false;
756
757         pdev = to_pci_dev(dev);
758         pbridge = to_pci_dev(bridge);
759
760         if (pbridge->subordinate &&
761             pbridge->subordinate->number <= pdev->bus->number &&
762             pbridge->subordinate->busn_res.end >= pdev->bus->number)
763                 return true;
764
765         return false;
766 }
767
768 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
769 {
770         struct dmar_drhd_unit *drhd = NULL;
771         struct intel_iommu *iommu;
772         struct device *tmp;
773         struct pci_dev *pdev = NULL;
774         u16 segment = 0;
775         int i;
776
777         if (iommu_dummy(dev))
778                 return NULL;
779
780         if (dev_is_pci(dev)) {
781                 struct pci_dev *pf_pdev;
782
783                 pdev = to_pci_dev(dev);
784
785 #ifdef CONFIG_X86
786                 /* VMD child devices currently cannot be handled individually */
787                 if (is_vmd(pdev->bus))
788                         return NULL;
789 #endif
790
791                 /* VFs aren't listed in scope tables; we need to look up
792                  * the PF instead to find the IOMMU. */
793                 pf_pdev = pci_physfn(pdev);
794                 dev = &pf_pdev->dev;
795                 segment = pci_domain_nr(pdev->bus);
796         } else if (has_acpi_companion(dev))
797                 dev = &ACPI_COMPANION(dev)->dev;
798
799         rcu_read_lock();
800         for_each_active_iommu(iommu, drhd) {
801                 if (pdev && segment != drhd->segment)
802                         continue;
803
804                 for_each_active_dev_scope(drhd->devices,
805                                           drhd->devices_cnt, i, tmp) {
806                         if (tmp == dev) {
807                                 /* For a VF use its original BDF# not that of the PF
808                                  * which we used for the IOMMU lookup. Strictly speaking
809                                  * we could do this for all PCI devices; we only need to
810                                  * get the BDF# from the scope table for ACPI matches. */
811                                 if (pdev && pdev->is_virtfn)
812                                         goto got_pdev;
813
814                                 *bus = drhd->devices[i].bus;
815                                 *devfn = drhd->devices[i].devfn;
816                                 goto out;
817                         }
818
819                         if (is_downstream_to_pci_bridge(dev, tmp))
820                                 goto got_pdev;
821                 }
822
823                 if (pdev && drhd->include_all) {
824                 got_pdev:
825                         *bus = pdev->bus->number;
826                         *devfn = pdev->devfn;
827                         goto out;
828                 }
829         }
830         iommu = NULL;
831  out:
832         rcu_read_unlock();
833
834         return iommu;
835 }
836
837 static void domain_flush_cache(struct dmar_domain *domain,
838                                void *addr, int size)
839 {
840         if (!domain->iommu_coherency)
841                 clflush_cache_range(addr, size);
842 }
843
844 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
845 {
846         struct context_entry *context;
847         int ret = 0;
848         unsigned long flags;
849
850         spin_lock_irqsave(&iommu->lock, flags);
851         context = iommu_context_addr(iommu, bus, devfn, 0);
852         if (context)
853                 ret = context_present(context);
854         spin_unlock_irqrestore(&iommu->lock, flags);
855         return ret;
856 }
857
858 static void free_context_table(struct intel_iommu *iommu)
859 {
860         int i;
861         unsigned long flags;
862         struct context_entry *context;
863
864         spin_lock_irqsave(&iommu->lock, flags);
865         if (!iommu->root_entry) {
866                 goto out;
867         }
868         for (i = 0; i < ROOT_ENTRY_NR; i++) {
869                 context = iommu_context_addr(iommu, i, 0, 0);
870                 if (context)
871                         free_pgtable_page(context);
872
873                 if (!sm_supported(iommu))
874                         continue;
875
876                 context = iommu_context_addr(iommu, i, 0x80, 0);
877                 if (context)
878                         free_pgtable_page(context);
879
880         }
881         free_pgtable_page(iommu->root_entry);
882         iommu->root_entry = NULL;
883 out:
884         spin_unlock_irqrestore(&iommu->lock, flags);
885 }
886
887 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
888                                       unsigned long pfn, int *target_level)
889 {
890         struct dma_pte *parent, *pte;
891         int level = agaw_to_level(domain->agaw);
892         int offset;
893
894         BUG_ON(!domain->pgd);
895
896         if (!domain_pfn_supported(domain, pfn))
897                 /* Address beyond IOMMU's addressing capabilities. */
898                 return NULL;
899
900         parent = domain->pgd;
901
902         while (1) {
903                 void *tmp_page;
904
905                 offset = pfn_level_offset(pfn, level);
906                 pte = &parent[offset];
907                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
908                         break;
909                 if (level == *target_level)
910                         break;
911
912                 if (!dma_pte_present(pte)) {
913                         uint64_t pteval;
914
915                         tmp_page = alloc_pgtable_page(domain->nid);
916
917                         if (!tmp_page)
918                                 return NULL;
919
920                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
921                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
922                         if (cmpxchg64(&pte->val, 0ULL, pteval))
923                                 /* Someone else set it while we were thinking; use theirs. */
924                                 free_pgtable_page(tmp_page);
925                         else
926                                 domain_flush_cache(domain, pte, sizeof(*pte));
927                 }
928                 if (level == 1)
929                         break;
930
931                 parent = phys_to_virt(dma_pte_addr(pte));
932                 level--;
933         }
934
935         if (!*target_level)
936                 *target_level = level;
937
938         return pte;
939 }
940
941 /* return the address's pte at a specific level */
942 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
943                                          unsigned long pfn,
944                                          int level, int *large_page)
945 {
946         struct dma_pte *parent, *pte;
947         int total = agaw_to_level(domain->agaw);
948         int offset;
949
950         parent = domain->pgd;
951         while (level <= total) {
952                 offset = pfn_level_offset(pfn, total);
953                 pte = &parent[offset];
954                 if (level == total)
955                         return pte;
956
957                 if (!dma_pte_present(pte)) {
958                         *large_page = total;
959                         break;
960                 }
961
962                 if (dma_pte_superpage(pte)) {
963                         *large_page = total;
964                         return pte;
965                 }
966
967                 parent = phys_to_virt(dma_pte_addr(pte));
968                 total--;
969         }
970         return NULL;
971 }
972
973 /* clear last level pte; a TLB flush should follow */
974 static void dma_pte_clear_range(struct dmar_domain *domain,
975                                 unsigned long start_pfn,
976                                 unsigned long last_pfn)
977 {
978         unsigned int large_page;
979         struct dma_pte *first_pte, *pte;
980
981         BUG_ON(!domain_pfn_supported(domain, start_pfn));
982         BUG_ON(!domain_pfn_supported(domain, last_pfn));
983         BUG_ON(start_pfn > last_pfn);
984
985         /* we don't need lock here; nobody else touches the iova range */
986         do {
987                 large_page = 1;
988                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
989                 if (!pte) {
990                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
991                         continue;
992                 }
993                 do {
994                         dma_clear_pte(pte);
995                         start_pfn += lvl_to_nr_pages(large_page);
996                         pte++;
997                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
998
999                 domain_flush_cache(domain, first_pte,
1000                                    (void *)pte - (void *)first_pte);
1001
1002         } while (start_pfn && start_pfn <= last_pfn);
1003 }
1004
1005 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1006                                int retain_level, struct dma_pte *pte,
1007                                unsigned long pfn, unsigned long start_pfn,
1008                                unsigned long last_pfn)
1009 {
1010         pfn = max(start_pfn, pfn);
1011         pte = &pte[pfn_level_offset(pfn, level)];
1012
1013         do {
1014                 unsigned long level_pfn;
1015                 struct dma_pte *level_pte;
1016
1017                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1018                         goto next;
1019
1020                 level_pfn = pfn & level_mask(level);
1021                 level_pte = phys_to_virt(dma_pte_addr(pte));
1022
1023                 if (level > 2) {
1024                         dma_pte_free_level(domain, level - 1, retain_level,
1025                                            level_pte, level_pfn, start_pfn,
1026                                            last_pfn);
1027                 }
1028
1029                 /*
1030                  * Free the page table if we're below the level we want to
1031                  * retain and the range covers the entire table.
1032                  */
1033                 if (level < retain_level && !(start_pfn > level_pfn ||
1034                       last_pfn < level_pfn + level_size(level) - 1)) {
1035                         dma_clear_pte(pte);
1036                         domain_flush_cache(domain, pte, sizeof(*pte));
1037                         free_pgtable_page(level_pte);
1038                 }
1039 next:
1040                 pfn += level_size(level);
1041         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1042 }
1043
1044 /*
1045  * clear last level (leaf) ptes and free page table pages below the
1046  * level we wish to keep intact.
1047  */
1048 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1049                                    unsigned long start_pfn,
1050                                    unsigned long last_pfn,
1051                                    int retain_level)
1052 {
1053         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1054         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1055         BUG_ON(start_pfn > last_pfn);
1056
1057         dma_pte_clear_range(domain, start_pfn, last_pfn);
1058
1059         /* We don't need lock here; nobody else touches the iova range */
1060         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1061                            domain->pgd, 0, start_pfn, last_pfn);
1062
1063         /* free pgd */
1064         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1065                 free_pgtable_page(domain->pgd);
1066                 domain->pgd = NULL;
1067         }
1068 }
1069
1070 /* When a page at a given level is being unlinked from its parent, we don't
1071    need to *modify* it at all. All we need to do is make a list of all the
1072    pages which can be freed just as soon as we've flushed the IOTLB and we
1073    know the hardware page-walk will no longer touch them.
1074    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1075    be freed. */
1076 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1077                                             int level, struct dma_pte *pte,
1078                                             struct page *freelist)
1079 {
1080         struct page *pg;
1081
1082         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1083         pg->freelist = freelist;
1084         freelist = pg;
1085
1086         if (level == 1)
1087                 return freelist;
1088
1089         pte = page_address(pg);
1090         do {
1091                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1092                         freelist = dma_pte_list_pagetables(domain, level - 1,
1093                                                            pte, freelist);
1094                 pte++;
1095         } while (!first_pte_in_page(pte));
1096
1097         return freelist;
1098 }
1099
1100 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1101                                         struct dma_pte *pte, unsigned long pfn,
1102                                         unsigned long start_pfn,
1103                                         unsigned long last_pfn,
1104                                         struct page *freelist)
1105 {
1106         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1107
1108         pfn = max(start_pfn, pfn);
1109         pte = &pte[pfn_level_offset(pfn, level)];
1110
1111         do {
1112                 unsigned long level_pfn;
1113
1114                 if (!dma_pte_present(pte))
1115                         goto next;
1116
1117                 level_pfn = pfn & level_mask(level);
1118
1119                 /* If range covers entire pagetable, free it */
1120                 if (start_pfn <= level_pfn &&
1121                     last_pfn >= level_pfn + level_size(level) - 1) {
1122                         /* These subordinate page tables are going away entirely. Don't
1123                            bother to clear them; we're just going to *free* them. */
1124                         if (level > 1 && !dma_pte_superpage(pte))
1125                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1126
1127                         dma_clear_pte(pte);
1128                         if (!first_pte)
1129                                 first_pte = pte;
1130                         last_pte = pte;
1131                 } else if (level > 1) {
1132                         /* Recurse down into a level that isn't *entirely* obsolete */
1133                         freelist = dma_pte_clear_level(domain, level - 1,
1134                                                        phys_to_virt(dma_pte_addr(pte)),
1135                                                        level_pfn, start_pfn, last_pfn,
1136                                                        freelist);
1137                 }
1138 next:
1139                 pfn += level_size(level);
1140         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1141
1142         if (first_pte)
1143                 domain_flush_cache(domain, first_pte,
1144                                    (void *)++last_pte - (void *)first_pte);
1145
1146         return freelist;
1147 }
1148
1149 /* We can't just free the pages because the IOMMU may still be walking
1150    the page tables, and may have cached the intermediate levels. The
1151    pages can only be freed after the IOTLB flush has been done. */
1152 static struct page *domain_unmap(struct dmar_domain *domain,
1153                                  unsigned long start_pfn,
1154                                  unsigned long last_pfn)
1155 {
1156         struct page *freelist;
1157
1158         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1159         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1160         BUG_ON(start_pfn > last_pfn);
1161
1162         /* we don't need lock here; nobody else touches the iova range */
1163         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1164                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1165
1166         /* free pgd */
1167         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1168                 struct page *pgd_page = virt_to_page(domain->pgd);
1169                 pgd_page->freelist = freelist;
1170                 freelist = pgd_page;
1171
1172                 domain->pgd = NULL;
1173         }
1174
1175         return freelist;
1176 }
1177
1178 static void dma_free_pagelist(struct page *freelist)
1179 {
1180         struct page *pg;
1181
1182         while ((pg = freelist)) {
1183                 freelist = pg->freelist;
1184                 free_pgtable_page(page_address(pg));
1185         }
1186 }
1187
1188 static void iova_entry_free(unsigned long data)
1189 {
1190         struct page *freelist = (struct page *)data;
1191
1192         dma_free_pagelist(freelist);
1193 }
1194
1195 /* iommu handling */
1196 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1197 {
1198         struct root_entry *root;
1199         unsigned long flags;
1200
1201         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1202         if (!root) {
1203                 pr_err("Allocating root entry for %s failed\n",
1204                         iommu->name);
1205                 return -ENOMEM;
1206         }
1207
1208         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1209
1210         spin_lock_irqsave(&iommu->lock, flags);
1211         iommu->root_entry = root;
1212         spin_unlock_irqrestore(&iommu->lock, flags);
1213
1214         return 0;
1215 }
1216
1217 static void iommu_set_root_entry(struct intel_iommu *iommu)
1218 {
1219         u64 addr;
1220         u32 sts;
1221         unsigned long flag;
1222
1223         addr = virt_to_phys(iommu->root_entry);
1224         if (sm_supported(iommu))
1225                 addr |= DMA_RTADDR_SMT;
1226
1227         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1228         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1229
1230         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1231
1232         /* Make sure hardware completes it */
1233         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1234                       readl, (sts & DMA_GSTS_RTPS), sts);
1235
1236         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1237 }
1238
1239 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1240 {
1241         u32 val;
1242         unsigned long flag;
1243
1244         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1245                 return;
1246
1247         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1249
1250         /* Make sure hardware completes it */
1251         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1252                       readl, (!(val & DMA_GSTS_WBFS)), val);
1253
1254         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1255 }
1256
1257 /* return value determines if we need a write buffer flush */
1258 static void __iommu_flush_context(struct intel_iommu *iommu,
1259                                   u16 did, u16 source_id, u8 function_mask,
1260                                   u64 type)
1261 {
1262         u64 val = 0;
1263         unsigned long flag;
1264
1265         switch (type) {
1266         case DMA_CCMD_GLOBAL_INVL:
1267                 val = DMA_CCMD_GLOBAL_INVL;
1268                 break;
1269         case DMA_CCMD_DOMAIN_INVL:
1270                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1271                 break;
1272         case DMA_CCMD_DEVICE_INVL:
1273                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1274                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1275                 break;
1276         default:
1277                 BUG();
1278         }
1279         val |= DMA_CCMD_ICC;
1280
1281         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1282         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1283
1284         /* Make sure hardware completes it */
1285         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1286                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1287
1288         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1289 }
1290
1291 /* return value determines if we need a write buffer flush */
1292 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1293                                 u64 addr, unsigned int size_order, u64 type)
1294 {
1295         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1296         u64 val = 0, val_iva = 0;
1297         unsigned long flag;
1298
1299         switch (type) {
1300         case DMA_TLB_GLOBAL_FLUSH:
1301                 /* global flush doesn't need to set IVA_REG */
1302                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1303                 break;
1304         case DMA_TLB_DSI_FLUSH:
1305                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306                 break;
1307         case DMA_TLB_PSI_FLUSH:
1308                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 /* IH bit is passed in as part of address */
1310                 val_iva = size_order | addr;
1311                 break;
1312         default:
1313                 BUG();
1314         }
1315         /* Note: set drain read/write */
1316 #if 0
1317         /*
1318          * This is probably meant to be super secure. Looks like we can
1319          * ignore it without any impact.
1320          */
1321         if (cap_read_drain(iommu->cap))
1322                 val |= DMA_TLB_READ_DRAIN;
1323 #endif
1324         if (cap_write_drain(iommu->cap))
1325                 val |= DMA_TLB_WRITE_DRAIN;
1326
1327         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1328         /* Note: Only uses first TLB reg currently */
1329         if (val_iva)
1330                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1331         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1332
1333         /* Make sure hardware completes it */
1334         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1335                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1336
1337         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1338
1339         /* check IOTLB invalidation granularity */
1340         if (DMA_TLB_IAIG(val) == 0)
1341                 pr_err("Flush IOTLB failed\n");
1342         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1343                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1344                         (unsigned long long)DMA_TLB_IIRG(type),
1345                         (unsigned long long)DMA_TLB_IAIG(val));
1346 }
1347
1348 static struct device_domain_info *
1349 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1350                          u8 bus, u8 devfn)
1351 {
1352         struct device_domain_info *info;
1353
1354         assert_spin_locked(&device_domain_lock);
1355
1356         if (!iommu->qi)
1357                 return NULL;
1358
1359         list_for_each_entry(info, &domain->devices, link)
1360                 if (info->iommu == iommu && info->bus == bus &&
1361                     info->devfn == devfn) {
1362                         if (info->ats_supported && info->dev)
1363                                 return info;
1364                         break;
1365                 }
1366
1367         return NULL;
1368 }
1369
1370 static void domain_update_iotlb(struct dmar_domain *domain)
1371 {
1372         struct device_domain_info *info;
1373         bool has_iotlb_device = false;
1374
1375         assert_spin_locked(&device_domain_lock);
1376
1377         list_for_each_entry(info, &domain->devices, link) {
1378                 struct pci_dev *pdev;
1379
1380                 if (!info->dev || !dev_is_pci(info->dev))
1381                         continue;
1382
1383                 pdev = to_pci_dev(info->dev);
1384                 if (pdev->ats_enabled) {
1385                         has_iotlb_device = true;
1386                         break;
1387                 }
1388         }
1389
1390         domain->has_iotlb_device = has_iotlb_device;
1391 }
1392
1393 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1394 {
1395         struct pci_dev *pdev;
1396
1397         assert_spin_locked(&device_domain_lock);
1398
1399         if (!info || !dev_is_pci(info->dev))
1400                 return;
1401
1402         pdev = to_pci_dev(info->dev);
1403         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1404          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1405          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1406          * reserved, which should be set to 0.
1407          */
1408         if (!ecap_dit(info->iommu->ecap))
1409                 info->pfsid = 0;
1410         else {
1411                 struct pci_dev *pf_pdev;
1412
1413                 /* pdev will be returned if device is not a vf */
1414                 pf_pdev = pci_physfn(pdev);
1415                 info->pfsid = pci_dev_id(pf_pdev);
1416         }
1417
1418 #ifdef CONFIG_INTEL_IOMMU_SVM
1419         /* The PCIe spec, in its wisdom, declares that the behaviour of
1420            the device if you enable PASID support after ATS support is
1421            undefined. So always enable PASID support on devices which
1422            have it, even if we can't yet know if we're ever going to
1423            use it. */
1424         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1425                 info->pasid_enabled = 1;
1426
1427         if (info->pri_supported &&
1428             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1429             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1430                 info->pri_enabled = 1;
1431 #endif
1432         if (!pdev->untrusted && info->ats_supported &&
1433             pci_ats_page_aligned(pdev) &&
1434             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1435                 info->ats_enabled = 1;
1436                 domain_update_iotlb(info->domain);
1437                 info->ats_qdep = pci_ats_queue_depth(pdev);
1438         }
1439 }
1440
1441 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1442 {
1443         struct pci_dev *pdev;
1444
1445         assert_spin_locked(&device_domain_lock);
1446
1447         if (!dev_is_pci(info->dev))
1448                 return;
1449
1450         pdev = to_pci_dev(info->dev);
1451
1452         if (info->ats_enabled) {
1453                 pci_disable_ats(pdev);
1454                 info->ats_enabled = 0;
1455                 domain_update_iotlb(info->domain);
1456         }
1457 #ifdef CONFIG_INTEL_IOMMU_SVM
1458         if (info->pri_enabled) {
1459                 pci_disable_pri(pdev);
1460                 info->pri_enabled = 0;
1461         }
1462         if (info->pasid_enabled) {
1463                 pci_disable_pasid(pdev);
1464                 info->pasid_enabled = 0;
1465         }
1466 #endif
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         u16 sid, qdep;
1473         unsigned long flags;
1474         struct device_domain_info *info;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&device_domain_lock, flags);
1480         list_for_each_entry(info, &domain->devices, link) {
1481                 if (!info->ats_enabled)
1482                         continue;
1483
1484                 sid = info->bus << 8 | info->devfn;
1485                 qdep = info->ats_qdep;
1486                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1487                                 qdep, addr, mask);
1488         }
1489         spin_unlock_irqrestore(&device_domain_lock, flags);
1490 }
1491
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493                                   struct dmar_domain *domain,
1494                                   unsigned long pfn, unsigned int pages,
1495                                   int ih, int map)
1496 {
1497         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1498         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1499         u16 did = domain->iommu_did[iommu->seq_id];
1500
1501         BUG_ON(pages == 0);
1502
1503         if (ih)
1504                 ih = 1 << 6;
1505         /*
1506          * Fall back to domain-selective flush if there is no PSI support or the
1507          * size is too big.
1508          * PSI requires page size to be 2 ^ x, and the base address is naturally
1509          * aligned to the size
1510          */
1511         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1512                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1513                                                 DMA_TLB_DSI_FLUSH);
1514         else
1515                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1516                                                 DMA_TLB_PSI_FLUSH);
1517
1518         /*
1519          * In caching mode, changes of pages from non-present to present require
1520          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1521          */
1522         if (!cap_caching_mode(iommu->cap) || !map)
1523                 iommu_flush_dev_iotlb(domain, addr, mask);
1524 }
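/*
 * Worked example (illustrative): for pages = 3 the code above computes
 * mask = ilog2(__roundup_pow_of_two(3)) = 2, so the PSI flush covers a
 * naturally aligned 4-page (16KiB) region containing the range; if the
 * mask exceeds cap_max_amask_val() the whole domain is flushed instead.
 */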
1525
1526 /* Notification for newly created mappings */
1527 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1528                                         struct dmar_domain *domain,
1529                                         unsigned long pfn, unsigned int pages)
1530 {
1531         /* It's a non-present to present mapping. Only flush if caching mode */
1532         if (cap_caching_mode(iommu->cap))
1533                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1534         else
1535                 iommu_flush_write_buffer(iommu);
1536 }
1537
1538 static void iommu_flush_iova(struct iova_domain *iovad)
1539 {
1540         struct dmar_domain *domain;
1541         int idx;
1542
1543         domain = container_of(iovad, struct dmar_domain, iovad);
1544
1545         for_each_domain_iommu(idx, domain) {
1546                 struct intel_iommu *iommu = g_iommus[idx];
1547                 u16 did = domain->iommu_did[iommu->seq_id];
1548
1549                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1550
1551                 if (!cap_caching_mode(iommu->cap))
1552                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1553                                               0, MAX_AGAW_PFN_WIDTH);
1554         }
1555 }
1556
1557 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1558 {
1559         u32 pmen;
1560         unsigned long flags;
1561
1562         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1563                 return;
1564
1565         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1567         pmen &= ~DMA_PMEN_EPM;
1568         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1569
1570         /* wait for the protected region status bit to clear */
1571         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1572                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1573
1574         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1575 }
1576
1577 static void iommu_enable_translation(struct intel_iommu *iommu)
1578 {
1579         u32 sts;
1580         unsigned long flags;
1581
1582         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1583         iommu->gcmd |= DMA_GCMD_TE;
1584         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1585
1586         /* Make sure the hardware has completed it */
1587         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1588                       readl, (sts & DMA_GSTS_TES), sts);
1589
1590         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1591 }
1592
1593 static void iommu_disable_translation(struct intel_iommu *iommu)
1594 {
1595         u32 sts;
1596         unsigned long flag;
1597
1598         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1599         iommu->gcmd &= ~DMA_GCMD_TE;
1600         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1601
1602         /* Make sure the hardware has completed it */
1603         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1604                       readl, (!(sts & DMA_GSTS_TES)), sts);
1605
1606         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1607 }
1608
1609 static int iommu_init_domains(struct intel_iommu *iommu)
1610 {
1611         u32 ndomains, nlongs;
1612         size_t size;
1613
1614         ndomains = cap_ndoms(iommu->cap);
1615         pr_debug("%s: Number of Domains supported <%d>\n",
1616                  iommu->name, ndomains);
1617         nlongs = BITS_TO_LONGS(ndomains);
1618
1619         spin_lock_init(&iommu->lock);
1620
1621         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1622         if (!iommu->domain_ids) {
1623                 pr_err("%s: Allocating domain id array failed\n",
1624                        iommu->name);
1625                 return -ENOMEM;
1626         }
1627
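             /*
              * iommu->domains is used as a two-level table: ALIGN(ndomains, 256) / 256
              * slots, each pointing to a page of 256 domain pointers. Only slot 0 is
              * populated below; the remaining slots are presumably allocated on demand
              * as higher domain ids are handed out.
              */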
1628         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1629         iommu->domains = kzalloc(size, GFP_KERNEL);
1630
1631         if (iommu->domains) {
1632                 size = 256 * sizeof(struct dmar_domain *);
1633                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1634         }
1635
1636         if (!iommu->domains || !iommu->domains[0]) {
1637                 pr_err("%s: Allocating domain array failed\n",
1638                        iommu->name);
1639                 kfree(iommu->domain_ids);
1640                 kfree(iommu->domains);
1641                 iommu->domain_ids = NULL;
1642                 iommu->domains    = NULL;
1643                 return -ENOMEM;
1644         }
1645
1646         /*
1647          * If Caching mode is set, then invalid translations are tagged
1648          * with domain-id 0, hence we need to pre-allocate it. We also
1649          * use domain-id 0 as a marker for non-allocated domain-id, so
1650          * make sure it is not used for a real domain.
1651          */
1652         set_bit(0, iommu->domain_ids);
1653
1654         /*
1655          * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1656          * entry for first-level or pass-through translation modes should
1657          * be programmed with a domain ID different from those used for
1658          * second-level or nested translation. We reserve a domain ID for
1659          * this purpose.
1660          */
1661         if (sm_supported(iommu))
1662                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1663
1664         return 0;
1665 }
1666
1667 static void disable_dmar_iommu(struct intel_iommu *iommu)
1668 {
1669         struct device_domain_info *info, *tmp;
1670         unsigned long flags;
1671
1672         if (!iommu->domains || !iommu->domain_ids)
1673                 return;
1674
1675         spin_lock_irqsave(&device_domain_lock, flags);
1676         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1677                 if (info->iommu != iommu)
1678                         continue;
1679
1680                 if (!info->dev || !info->domain)
1681                         continue;
1682
1683                 __dmar_remove_one_dev_info(info);
1684         }
1685         spin_unlock_irqrestore(&device_domain_lock, flags);
1686
1687         if (iommu->gcmd & DMA_GCMD_TE)
1688                 iommu_disable_translation(iommu);
1689 }
1690
1691 static void free_dmar_iommu(struct intel_iommu *iommu)
1692 {
1693         if ((iommu->domains) && (iommu->domain_ids)) {
1694                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1695                 int i;
1696
1697                 for (i = 0; i < elems; i++)
1698                         kfree(iommu->domains[i]);
1699                 kfree(iommu->domains);
1700                 kfree(iommu->domain_ids);
1701                 iommu->domains = NULL;
1702                 iommu->domain_ids = NULL;
1703         }
1704
1705         g_iommus[iommu->seq_id] = NULL;
1706
1707         /* free context mapping */
1708         free_context_table(iommu);
1709
1710 #ifdef CONFIG_INTEL_IOMMU_SVM
1711         if (pasid_supported(iommu)) {
1712                 if (ecap_prs(iommu->ecap))
1713                         intel_svm_finish_prq(iommu);
1714         }
1715 #endif
1716 }
1717
1718 static struct dmar_domain *alloc_domain(int flags)
1719 {
1720         struct dmar_domain *domain;
1721
1722         domain = alloc_domain_mem();
1723         if (!domain)
1724                 return NULL;
1725
1726         memset(domain, 0, sizeof(*domain));
1727         domain->nid = NUMA_NO_NODE;
1728         domain->flags = flags;
1729         domain->has_iotlb_device = false;
1730         INIT_LIST_HEAD(&domain->devices);
1731
1732         return domain;
1733 }
1734
1735 /* Must be called with device_domain_lock and iommu->lock held. */
1736 static int domain_attach_iommu(struct dmar_domain *domain,
1737                                struct intel_iommu *iommu)
1738 {
1739         unsigned long ndomains;
1740         int num;
1741
1742         assert_spin_locked(&device_domain_lock);
1743         assert_spin_locked(&iommu->lock);
1744
1745         domain->iommu_refcnt[iommu->seq_id] += 1;
1746         domain->iommu_count += 1;
1747         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1748                 ndomains = cap_ndoms(iommu->cap);
1749                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1750
1751                 if (num >= ndomains) {
1752                         pr_err("%s: No free domain ids\n", iommu->name);
1753                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1754                         domain->iommu_count -= 1;
1755                         return -ENOSPC;
1756                 }
1757
1758                 set_bit(num, iommu->domain_ids);
1759                 set_iommu_domain(iommu, num, domain);
1760
1761                 domain->iommu_did[iommu->seq_id] = num;
1762                 domain->nid                      = iommu->node;
1763
1764                 domain_update_iommu_cap(domain);
1765         }
1766
1767         return 0;
1768 }
1769
1770 static int domain_detach_iommu(struct dmar_domain *domain,
1771                                struct intel_iommu *iommu)
1772 {
1773         int num, count;
1774
1775         assert_spin_locked(&device_domain_lock);
1776         assert_spin_locked(&iommu->lock);
1777
1778         domain->iommu_refcnt[iommu->seq_id] -= 1;
1779         count = --domain->iommu_count;
1780         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1781                 num = domain->iommu_did[iommu->seq_id];
1782                 clear_bit(num, iommu->domain_ids);
1783                 set_iommu_domain(iommu, num, NULL);
1784
1785                 domain_update_iommu_cap(domain);
1786                 domain->iommu_did[iommu->seq_id] = 0;
1787         }
1788
1789         return count;
1790 }
1791
1792 static struct iova_domain reserved_iova_list;
1793 static struct lock_class_key reserved_rbtree_key;
1794
1795 static int dmar_init_reserved_ranges(void)
1796 {
1797         struct pci_dev *pdev = NULL;
1798         struct iova *iova;
1799         int i;
1800
1801         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1802
1803         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1804                 &reserved_rbtree_key);
1805
1806         /* IOAPIC ranges shouldn't be accessed by DMA */
1807         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1808                 IOVA_PFN(IOAPIC_RANGE_END));
1809         if (!iova) {
1810                 pr_err("Reserve IOAPIC range failed\n");
1811                 return -ENODEV;
1812         }
1813
1814         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1815         for_each_pci_dev(pdev) {
1816                 struct resource *r;
1817
1818                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1819                         r = &pdev->resource[i];
1820                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1821                                 continue;
1822                         iova = reserve_iova(&reserved_iova_list,
1823                                             IOVA_PFN(r->start),
1824                                             IOVA_PFN(r->end));
1825                         if (!iova) {
1826                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1827                                 return -ENODEV;
1828                         }
1829                 }
1830         }
1831         return 0;
1832 }
1833
1834 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1835 {
1836         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1837 }
1838
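     /*
      * Round the guest address width up so that the width above the 12-bit
      * page offset is a multiple of the 9-bit stride, capped at 64 bits.
      * For example, gaw = 40 gives r = (40 - 12) % 9 = 1 and
      * agaw = 40 + 9 - 1 = 48.
      */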
1839 static inline int guestwidth_to_adjustwidth(int gaw)
1840 {
1841         int agaw;
1842         int r = (gaw - 12) % 9;
1843
1844         if (r == 0)
1845                 agaw = gaw;
1846         else
1847                 agaw = gaw + 9 - r;
1848         if (agaw > 64)
1849                 agaw = 64;
1850         return agaw;
1851 }
1852
1853 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1854                        int guest_width)
1855 {
1856         int adjust_width, agaw, cap_width;
1857         unsigned long sagaw;
1858         int err;
1859
1860         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1861
1862         err = init_iova_flush_queue(&domain->iovad,
1863                                     iommu_flush_iova, iova_entry_free);
1864         if (err)
1865                 return err;
1866
1867         domain_reserve_special_ranges(domain);
1868
1869         /* calculate AGAW */
1870         cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw));
1871         if (guest_width > cap_width)
1872                 guest_width = cap_width;
1873         domain->gaw = guest_width;
1874         adjust_width = guestwidth_to_adjustwidth(guest_width);
1875         agaw = width_to_agaw(adjust_width);
1876         sagaw = cap_sagaw(iommu->cap);
1877         if (!test_bit(agaw, &sagaw)) {
1878                 /* hardware doesn't support it, choose a bigger one */
1879                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1880                 agaw = find_next_bit(&sagaw, 5, agaw);
1881                 if (agaw >= 5)
1882                         return -ENODEV;
1883         }
1884         domain->agaw = agaw;
1885
1886         if (ecap_coherent(iommu->ecap))
1887                 domain->iommu_coherency = 1;
1888         else
1889                 domain->iommu_coherency = 0;
1890
1891         if (ecap_sc_support(iommu->ecap))
1892                 domain->iommu_snooping = 1;
1893         else
1894                 domain->iommu_snooping = 0;
1895
1896         if (intel_iommu_superpage)
1897                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1898         else
1899                 domain->iommu_superpage = 0;
1900
1901         domain->nid = iommu->node;
1902
1903         /* always allocate the top pgd */
1904         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1905         if (!domain->pgd)
1906                 return -ENOMEM;
1907         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1908         return 0;
1909 }
1910
1911 static void domain_exit(struct dmar_domain *domain)
1912 {
1913
1914         /* Remove associated devices and clear attached or cached domains */
1915         domain_remove_dev_info(domain);
1916
1917         /* destroy iovas */
1918         put_iova_domain(&domain->iovad);
1919
1920         if (domain->pgd) {
1921                 struct page *freelist;
1922
1923                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1924                 dma_free_pagelist(freelist);
1925         }
1926
1927         free_domain_mem(domain);
1928 }
1929
1930 /*
1931  * Get the PASID directory size for scalable mode context entry.
1932  * Value of X in the PDTS field of a scalable mode context entry
1933  * indicates a PASID directory with 2^(X + 7) entries.
1934  */
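     /*
      * For example, if max_pasid >> PASID_PDE_SHIFT is a power of two such as
      * 1 << 14, find_first_bit() returns 14 and the function returns 7, which
      * per the PDTS encoding above means a 2^14-entry PASID directory.
      */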
1935 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1936 {
1937         int pds, max_pde;
1938
1939         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1940         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1941         if (pds < 7)
1942                 return 0;
1943
1944         return pds - 7;
1945 }
1946
1947 /*
1948  * Set the RID_PASID field of a scalable mode context entry. The
1949  * IOMMU hardware will use the PASID value set in this field for
1950  * DMA translations of DMA requests without PASID.
1951  */
1952 static inline void
1953 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1954 {
1955         context->hi |= pasid & ((1 << 20) - 1);
1956         context->hi |= (1 << 20);
1957 }
1958
1959 /*
1960  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1961  * entry.
1962  */
1963 static inline void context_set_sm_dte(struct context_entry *context)
1964 {
1965         context->lo |= (1 << 2);
1966 }
1967
1968 /*
1969  * Set the PRE(Page Request Enable) field of a scalable mode context
1970  * entry.
1971  */
1972 static inline void context_set_sm_pre(struct context_entry *context)
1973 {
1974         context->lo |= (1 << 4);
1975 }
1976
1977 /* Convert value to context PASID directory size field coding. */
1978 #define context_pdts(pds)       (((pds) & 0x7) << 9)
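     /* i.e. the 3-bit PDTS value lands in bits 11:9 of the low context qword. */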
1979
1980 static int domain_context_mapping_one(struct dmar_domain *domain,
1981                                       struct intel_iommu *iommu,
1982                                       struct pasid_table *table,
1983                                       u8 bus, u8 devfn)
1984 {
1985         u16 did = domain->iommu_did[iommu->seq_id];
1986         int translation = CONTEXT_TT_MULTI_LEVEL;
1987         struct device_domain_info *info = NULL;
1988         struct context_entry *context;
1989         unsigned long flags;
1990         int ret;
1991
1992         WARN_ON(did == 0);
1993
1994         if (hw_pass_through && domain_type_is_si(domain))
1995                 translation = CONTEXT_TT_PASS_THROUGH;
1996
1997         pr_debug("Set context mapping for %02x:%02x.%d\n",
1998                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1999
2000         BUG_ON(!domain->pgd);
2001
2002         spin_lock_irqsave(&device_domain_lock, flags);
2003         spin_lock(&iommu->lock);
2004
2005         ret = -ENOMEM;
2006         context = iommu_context_addr(iommu, bus, devfn, 1);
2007         if (!context)
2008                 goto out_unlock;
2009
2010         ret = 0;
2011         if (context_present(context))
2012                 goto out_unlock;
2013
2014         /*
2015          * In kdump cases, old valid entries may be cached because of
2016          * in-flight DMA and the copied page tables, but they are never
2017          * unmapped, so we need an explicit cache flush for the
2018          * newly-mapped device. At this point the device is expected to
2019          * have finished its reset during its driver probe, so no
2020          * in-flight DMA will exist and we don't need to worry about it
2021          * afterwards.
2022          */
2023         if (context_copied(context)) {
2024                 u16 did_old = context_domain_id(context);
2025
2026                 if (did_old < cap_ndoms(iommu->cap)) {
2027                         iommu->flush.flush_context(iommu, did_old,
2028                                                    (((u16)bus) << 8) | devfn,
2029                                                    DMA_CCMD_MASK_NOBIT,
2030                                                    DMA_CCMD_DEVICE_INVL);
2031                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2032                                                  DMA_TLB_DSI_FLUSH);
2033                 }
2034         }
2035
2036         context_clear_entry(context);
2037
2038         if (sm_supported(iommu)) {
2039                 unsigned long pds;
2040
2041                 WARN_ON(!table);
2042
2043                 /* Setup the PASID DIR pointer: */
2044                 pds = context_get_sm_pds(table);
2045                 context->lo = (u64)virt_to_phys(table->table) |
2046                                 context_pdts(pds);
2047
2048                 /* Setup the RID_PASID field: */
2049                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2050
2051                 /*
2052                  * Setup the Device-TLB enable bit and Page request
2053                  * Enable bit:
2054                  */
2055                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2056                 if (info && info->ats_supported)
2057                         context_set_sm_dte(context);
2058                 if (info && info->pri_supported)
2059                         context_set_sm_pre(context);
2060         } else {
2061                 struct dma_pte *pgd = domain->pgd;
2062                 int agaw;
2063
2064                 context_set_domain_id(context, did);
2065
2066                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2067                         /*
2068                          * Skip the top levels of the page tables for an IOMMU
2069                          * whose AGAW is smaller than the default. Unnecessary for PT mode.
2070                          */
2071                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2072                                 ret = -ENOMEM;
2073                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2074                                 if (!dma_pte_present(pgd))
2075                                         goto out_unlock;
2076                         }
2077
2078                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2079                         if (info && info->ats_supported)
2080                                 translation = CONTEXT_TT_DEV_IOTLB;
2081                         else
2082                                 translation = CONTEXT_TT_MULTI_LEVEL;
2083
2084                         context_set_address_root(context, virt_to_phys(pgd));
2085                         context_set_address_width(context, agaw);
2086                 } else {
2087                         /*
2088                          * In pass-through mode, AW must be programmed to
2089                          * indicate the largest AGAW value supported by the
2090                          * hardware, and ASR is ignored by the hardware.
2091                          */
2092                         context_set_address_width(context, iommu->msagaw);
2093                 }
2094
2095                 context_set_translation_type(context, translation);
2096         }
2097
2098         context_set_fault_enable(context);
2099         context_set_present(context);
2100         if (!ecap_coherent(iommu->ecap))
2101                 clflush_cache_range(context, sizeof(*context));
2102
2103         /*
2104          * It's a non-present to present mapping. If the hardware doesn't cache
2105          * non-present entries, we only need to flush the write buffer. If it
2106          * _does_ cache non-present entries, then it does so in the special
2107          * domain #0, which we have to flush:
2108          */
2109         if (cap_caching_mode(iommu->cap)) {
2110                 iommu->flush.flush_context(iommu, 0,
2111                                            (((u16)bus) << 8) | devfn,
2112                                            DMA_CCMD_MASK_NOBIT,
2113                                            DMA_CCMD_DEVICE_INVL);
2114                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2115         } else {
2116                 iommu_flush_write_buffer(iommu);
2117         }
2118         iommu_enable_dev_iotlb(info);
2119
2120         ret = 0;
2121
2122 out_unlock:
2123         spin_unlock(&iommu->lock);
2124         spin_unlock_irqrestore(&device_domain_lock, flags);
2125
2126         return ret;
2127 }
2128
2129 struct domain_context_mapping_data {
2130         struct dmar_domain *domain;
2131         struct intel_iommu *iommu;
2132         struct pasid_table *table;
2133 };
2134
2135 static int domain_context_mapping_cb(struct pci_dev *pdev,
2136                                      u16 alias, void *opaque)
2137 {
2138         struct domain_context_mapping_data *data = opaque;
2139
2140         return domain_context_mapping_one(data->domain, data->iommu,
2141                                           data->table, PCI_BUS_NUM(alias),
2142                                           alias & 0xff);
2143 }
2144
2145 static int
2146 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2147 {
2148         struct domain_context_mapping_data data;
2149         struct pasid_table *table;
2150         struct intel_iommu *iommu;
2151         u8 bus, devfn;
2152
2153         iommu = device_to_iommu(dev, &bus, &devfn);
2154         if (!iommu)
2155                 return -ENODEV;
2156
2157         table = intel_pasid_get_table(dev);
2158
2159         if (!dev_is_pci(dev))
2160                 return domain_context_mapping_one(domain, iommu, table,
2161                                                   bus, devfn);
2162
2163         data.domain = domain;
2164         data.iommu = iommu;
2165         data.table = table;
2166
2167         return pci_for_each_dma_alias(to_pci_dev(dev),
2168                                       &domain_context_mapping_cb, &data);
2169 }
2170
2171 static int domain_context_mapped_cb(struct pci_dev *pdev,
2172                                     u16 alias, void *opaque)
2173 {
2174         struct intel_iommu *iommu = opaque;
2175
2176         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2177 }
2178
2179 static int domain_context_mapped(struct device *dev)
2180 {
2181         struct intel_iommu *iommu;
2182         u8 bus, devfn;
2183
2184         iommu = device_to_iommu(dev, &bus, &devfn);
2185         if (!iommu)
2186                 return -ENODEV;
2187
2188         if (!dev_is_pci(dev))
2189                 return device_context_mapped(iommu, bus, devfn);
2190
2191         return !pci_for_each_dma_alias(to_pci_dev(dev),
2192                                        domain_context_mapped_cb, iommu);
2193 }
2194
2195 /* Returns the number of VT-d pages, but aligned to the MM page size */
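     /* e.g. with 4KiB pages, a 0x1000-byte buffer at page offset 0x800 covers
        PAGE_ALIGN(0x1800) >> VTD_PAGE_SHIFT = 2 VT-d pages. */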
2196 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2197                                             size_t size)
2198 {
2199         host_addr &= ~PAGE_MASK;
2200         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2201 }
2202
2203 /* Return largest possible superpage level for a given mapping */
2204 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2205                                           unsigned long iov_pfn,
2206                                           unsigned long phy_pfn,
2207                                           unsigned long pages)
2208 {
2209         int support, level = 1;
2210         unsigned long pfnmerge;
2211
2212         support = domain->iommu_superpage;
2213
2214         /* To use a large page, the virtual *and* physical addresses
2215            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2216            of them will mean we have to use smaller pages. So just
2217            merge them and check both at once. */
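             /* For example, iov_pfn = phy_pfn = 0x200 with pages >= 512 and
                superpage support yields level 2, i.e. a 2MiB superpage. */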
2218         pfnmerge = iov_pfn | phy_pfn;
2219
2220         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2221                 pages >>= VTD_STRIDE_SHIFT;
2222                 if (!pages)
2223                         break;
2224                 pfnmerge >>= VTD_STRIDE_SHIFT;
2225                 level++;
2226                 support--;
2227         }
2228         return level;
2229 }
2230
2231 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2232                             struct scatterlist *sg, unsigned long phys_pfn,
2233                             unsigned long nr_pages, int prot)
2234 {
2235         struct dma_pte *first_pte = NULL, *pte = NULL;
2236         phys_addr_t uninitialized_var(pteval);
2237         unsigned long sg_res = 0;
2238         unsigned int largepage_lvl = 0;
2239         unsigned long lvl_pages = 0;
2240
2241         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2242
2243         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2244                 return -EINVAL;
2245
2246         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2247
2248         if (!sg) {
2249                 sg_res = nr_pages;
2250                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2251         }
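             /*
              * For a physically contiguous (non-scatterlist) mapping, pteval starts as
              * the physical address of the first page OR'd with the permission bits and
              * is simply advanced page by page below.
              */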
2252
2253         while (nr_pages > 0) {
2254                 uint64_t tmp;
2255
2256                 if (!sg_res) {
2257                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2258
2259                         sg_res = aligned_nrpages(sg->offset, sg->length);
2260                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2261                         sg->dma_length = sg->length;
2262                         pteval = (sg_phys(sg) - pgoff) | prot;
2263                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2264                 }
2265
2266                 if (!pte) {
2267                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2268
2269                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2270                         if (!pte)
2271                                 return -ENOMEM;
2272                         /* It is a large page */
2273                         if (largepage_lvl > 1) {
2274                                 unsigned long nr_superpages, end_pfn;
2275
2276                                 pteval |= DMA_PTE_LARGE_PAGE;
2277                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2278
2279                                 nr_superpages = sg_res / lvl_pages;
2280                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2281
2282                                 /*
2283                                  * Ensure that old small page tables are
2284                                  * removed to make room for superpage(s).
2285                                  * We're adding new large pages, so make sure
2286                                  * we don't remove their parent tables.
2287                                  */
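                                     /*
                                      * e.g. sg_res = 1024 at the 2MiB level
                                      * (lvl_pages = 512) gives nr_superpages = 2
                                      * and end_pfn = iov_pfn + 1023.
                                      */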
2288                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2289                                                        largepage_lvl + 1);
2290                         } else {
2291                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2292                         }
2293
2294                 }
2295                 /* We don't need a lock here; nobody else
2296                  * touches this IOVA range.
2297                  */
2298                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2299                 if (tmp) {
2300                         static int dumps = 5;
2301                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2302                                 iov_pfn, tmp, (unsigned long long)pteval);
2303                         if (dumps) {
2304                                 dumps--;
2305                                 debug_dma_dump_mappings(NULL);
2306                         }
2307                         WARN_ON(1);
2308                 }
2309
2310                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2311
2312                 BUG_ON(nr_pages < lvl_pages);
2313                 BUG_ON(sg_res < lvl_pages);
2314
2315                 nr_pages -= lvl_pages;
2316                 iov_pfn += lvl_pages;
2317                 phys_pfn += lvl_pages;
2318                 pteval += lvl_pages * VTD_PAGE_SIZE;
2319                 sg_res -= lvl_pages;
2320
2321                 /* If the next PTE would be the first in a new page, then we
2322                    need to flush the cache on the entries we've just written.
2323                    And then we'll need to recalculate 'pte', so clear it and
2324                    let it get set again in the if (!pte) block above.
2325
2326                    If we're done (!nr_pages) we need to flush the cache too.
2327
2328                    Also if we've been setting superpages, we may need to
2329                    recalculate 'pte' and switch back to smaller pages for the
2330                    end of the mapping, if the trailing size is not enough to
2331                    use another superpage (i.e. sg_res < lvl_pages). */
2332                 pte++;
2333                 if (!nr_pages || first_pte_in_page(pte) ||
2334                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2335                         domain_flush_cache(domain, first_pte,
2336                                            (void *)pte - (void *)first_pte);
2337                         pte = NULL;
2338                 }
2339
2340                 if (!sg_res && nr_pages)
2341                         sg = sg_next(sg);
2342         }
2343         return 0;
2344 }
2345
2346 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2347                           struct scatterlist *sg, unsigned long phys_pfn,
2348                           unsigned long nr_pages, int prot)
2349 {
2350         int iommu_id, ret;
2351         struct intel_iommu *iommu;
2352
2353         /* Do the real mapping first */
2354         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2355         if (ret)
2356                 return ret;
2357
2358         for_each_domain_iommu(iommu_id, domain) {
2359                 iommu = g_iommus[iommu_id];
2360                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361         }
2362
2363         return 0;
2364 }
2365
2366 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2367                                     struct scatterlist *sg, unsigned long nr_pages,
2368                                     int prot)
2369 {
2370         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2371 }
2372
2373 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2374                                      unsigned long phys_pfn, unsigned long nr_pages,
2375                                      int prot)
2376 {
2377         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2378 }
2379
2380 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2381 {
2382         unsigned long flags;
2383         struct context_entry *context;
2384         u16 did_old;
2385
2386         if (!iommu)
2387                 return;
2388
2389         spin_lock_irqsave(&iommu->lock, flags);
2390         context = iommu_context_addr(iommu, bus, devfn, 0);
2391         if (!context) {
2392                 spin_unlock_irqrestore(&iommu->lock, flags);
2393                 return;
2394         }
2395         did_old = context_domain_id(context);
2396         context_clear_entry(context);
2397         __iommu_flush_cache(iommu, context, sizeof(*context));
2398         spin_unlock_irqrestore(&iommu->lock, flags);
2399         iommu->flush.flush_context(iommu,
2400                                    did_old,
2401                                    (((u16)bus) << 8) | devfn,
2402                                    DMA_CCMD_MASK_NOBIT,
2403                                    DMA_CCMD_DEVICE_INVL);
2404         iommu->flush.flush_iotlb(iommu,
2405                                  did_old,
2406                                  0,
2407                                  0,
2408                                  DMA_TLB_DSI_FLUSH);
2409 }
2410
2411 static inline void unlink_domain_info(struct device_domain_info *info)
2412 {
2413         assert_spin_locked(&device_domain_lock);
2414         list_del(&info->link);
2415         list_del(&info->global);
2416         if (info->dev)
2417                 info->dev->archdata.iommu = NULL;
2418 }
2419
2420 static void domain_remove_dev_info(struct dmar_domain *domain)
2421 {
2422         struct device_domain_info *info, *tmp;
2423         unsigned long flags;
2424
2425         spin_lock_irqsave(&device_domain_lock, flags);
2426         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2427                 __dmar_remove_one_dev_info(info);
2428         spin_unlock_irqrestore(&device_domain_lock, flags);
2429 }
2430
2431 /*
2432  * find_domain
2433  * Note: we use struct device->archdata.iommu to store the info
2434  */
2435 static struct dmar_domain *find_domain(struct device *dev)
2436 {
2437         struct device_domain_info *info;
2438
2439         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2440                 struct iommu_domain *domain;
2441
2442                 dev->archdata.iommu = NULL;
2443                 domain = iommu_get_domain_for_dev(dev);
2444                 if (domain)
2445                         intel_iommu_attach_device(domain, dev);
2446         }
2447
2448         /* No lock here; we assume no domain exits in the normal case. */
2449         info = dev->archdata.iommu;
2450
2451         if (likely(info))
2452                 return info->domain;
2453         return NULL;
2454 }
2455
2456 static inline struct device_domain_info *
2457 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2458 {
2459         struct device_domain_info *info;
2460
2461         list_for_each_entry(info, &device_domain_list, global)
2462                 if (info->iommu->segment == segment && info->bus == bus &&
2463                     info->devfn == devfn)
2464                         return info;
2465
2466         return NULL;
2467 }
2468
2469 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2470                                                     int bus, int devfn,
2471                                                     struct device *dev,
2472                                                     struct dmar_domain *domain)
2473 {
2474         struct dmar_domain *found = NULL;
2475         struct device_domain_info *info;
2476         unsigned long flags;
2477         int ret;
2478
2479         info = alloc_devinfo_mem();
2480         if (!info)
2481                 return NULL;
2482
2483         info->bus = bus;
2484         info->devfn = devfn;
2485         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2486         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2487         info->ats_qdep = 0;
2488         info->dev = dev;
2489         info->domain = domain;
2490         info->iommu = iommu;
2491         info->pasid_table = NULL;
2492         info->auxd_enabled = 0;
2493         INIT_LIST_HEAD(&info->auxiliary_domains);
2494
2495         if (dev && dev_is_pci(dev)) {
2496                 struct pci_dev *pdev = to_pci_dev(info->dev);
2497
2498                 if (!pdev->untrusted &&
2499                     !pci_ats_disabled() &&
2500                     ecap_dev_iotlb_support(iommu->ecap) &&
2501                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2502                     dmar_find_matched_atsr_unit(pdev))
2503                         info->ats_supported = 1;
2504
2505                 if (sm_supported(iommu)) {
2506                         if (pasid_supported(iommu)) {
2507                                 int features = pci_pasid_features(pdev);
2508                                 if (features >= 0)
2509                                         info->pasid_supported = features | 1;
2510                         }
2511
2512                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2513                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2514                                 info->pri_supported = 1;
2515                 }
2516         }
2517
2518         spin_lock_irqsave(&device_domain_lock, flags);
2519         if (dev)
2520                 found = find_domain(dev);
2521
2522         if (!found) {
2523                 struct device_domain_info *info2;
2524                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2525                 if (info2) {
2526                         found      = info2->domain;
2527                         info2->dev = dev;
2528                 }
2529         }
2530
2531         if (found) {
2532                 spin_unlock_irqrestore(&device_domain_lock, flags);
2533                 free_devinfo_mem(info);
2534                 /* Caller must free the original domain */
2535                 return found;
2536         }
2537
2538         spin_lock(&iommu->lock);
2539         ret = domain_attach_iommu(domain, iommu);
2540         spin_unlock(&iommu->lock);
2541
2542         if (ret) {
2543                 spin_unlock_irqrestore(&device_domain_lock, flags);
2544                 free_devinfo_mem(info);
2545                 return NULL;
2546         }
2547
2548         list_add(&info->link, &domain->devices);
2549         list_add(&info->global, &device_domain_list);
2550         if (dev)
2551                 dev->archdata.iommu = info;
2552         spin_unlock_irqrestore(&device_domain_lock, flags);
2553
2554         /* PASID table is mandatory for a PCI device in scalable mode. */
2555         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2556                 ret = intel_pasid_alloc_table(dev);
2557                 if (ret) {
2558                         dev_err(dev, "PASID table allocation failed\n");
2559                         dmar_remove_one_dev_info(dev);
2560                         return NULL;
2561                 }
2562
2563                 /* Setup the PASID entry for requests without PASID: */
2564                 spin_lock_irqsave(&iommu->lock, flags);
2565                 if (hw_pass_through && domain_type_is_si(domain))
2566                         ret = intel_pasid_setup_pass_through(iommu, domain,
2567                                         dev, PASID_RID2PASID);
2568                 else
2569                         ret = intel_pasid_setup_second_level(iommu, domain,
2570                                         dev, PASID_RID2PASID);
2571                 spin_unlock_irqrestore(&iommu->lock, flags);
2572                 if (ret) {
2573                         dev_err(dev, "Setup RID2PASID failed\n");
2574                         dmar_remove_one_dev_info(dev);
2575                         return NULL;
2576                 }
2577         }
2578
2579         if (dev && domain_context_mapping(domain, dev)) {
2580                 dev_err(dev, "Domain context map failed\n");
2581                 dmar_remove_one_dev_info(dev);
2582                 return NULL;
2583         }
2584
2585         return domain;
2586 }
2587
2588 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2589 {
2590         *(u16 *)opaque = alias;
2591         return 0;
2592 }
2593
2594 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2595 {
2596         struct device_domain_info *info;
2597         struct dmar_domain *domain = NULL;
2598         struct intel_iommu *iommu;
2599         u16 dma_alias;
2600         unsigned long flags;
2601         u8 bus, devfn;
2602
2603         iommu = device_to_iommu(dev, &bus, &devfn);
2604         if (!iommu)
2605                 return NULL;
2606
2607         if (dev_is_pci(dev)) {
2608                 struct pci_dev *pdev = to_pci_dev(dev);
2609
2610                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611
2612                 spin_lock_irqsave(&device_domain_lock, flags);
2613                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2614                                                       PCI_BUS_NUM(dma_alias),
2615                                                       dma_alias & 0xff);
2616                 if (info) {
2617                         iommu = info->iommu;
2618                         domain = info->domain;
2619                 }
2620                 spin_unlock_irqrestore(&device_domain_lock, flags);
2621
2622                 /* DMA alias already has a domain, use it */
2623                 if (info)
2624                         goto out;
2625         }
2626
2627         /* Allocate and initialize new domain for the device */
2628         domain = alloc_domain(0);
2629         if (!domain)
2630                 return NULL;
2631         if (domain_init(domain, iommu, gaw)) {
2632                 domain_exit(domain);
2633                 return NULL;
2634         }
2635
2636 out:
2637         return domain;
2638 }
2639
2640 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2641                                               struct dmar_domain *domain)
2642 {
2643         struct intel_iommu *iommu;
2644         struct dmar_domain *tmp;
2645         u16 req_id, dma_alias;
2646         u8 bus, devfn;
2647
2648         iommu = device_to_iommu(dev, &bus, &devfn);
2649         if (!iommu)
2650                 return NULL;
2651
2652         req_id = ((u16)bus << 8) | devfn;
2653
2654         if (dev_is_pci(dev)) {
2655                 struct pci_dev *pdev = to_pci_dev(dev);
2656
2657                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2658
2659                 /* register PCI DMA alias device */
2660                 if (req_id != dma_alias) {
2661                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2662                                         dma_alias & 0xff, NULL, domain);
2663
2664                         if (!tmp || tmp != domain)
2665                                 return tmp;
2666                 }
2667         }
2668
2669         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2670         if (!tmp || tmp != domain)
2671                 return tmp;
2672
2673         return domain;
2674 }
2675
2676 static int iommu_domain_identity_map(struct dmar_domain *domain,
2677                                      unsigned long long start,
2678                                      unsigned long long end)
2679 {
2680         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2681         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
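             /* e.g. start = 0x1000, end = 0x3fff gives first_vpfn = 1 and
                last_vpfn = 3, i.e. three 4KiB pages reserved and mapped 1:1. */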
2682
2683         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2684                           dma_to_mm_pfn(last_vpfn))) {
2685                 pr_err("Reserving iova failed\n");
2686                 return -ENOMEM;
2687         }
2688
2689         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2690         /*
2691          * The RMRR range might overlap with the physical memory range,
2692          * so clear it first.
2693          */
2694         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2695
2696         return __domain_mapping(domain, first_vpfn, NULL,
2697                                 first_vpfn, last_vpfn - first_vpfn + 1,
2698                                 DMA_PTE_READ|DMA_PTE_WRITE);
2699 }
2700
2701 static int domain_prepare_identity_map(struct device *dev,
2702                                        struct dmar_domain *domain,
2703                                        unsigned long long start,
2704                                        unsigned long long end)
2705 {
2706         /* For _hardware_ passthrough, don't bother. But for software
2707            passthrough, we do it anyway -- it may indicate a memory
2708            range which is reserved in E820 and so didn't get set
2709            up to start with in si_domain */
2710         if (domain == si_domain && hw_pass_through) {
2711                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2712                          start, end);
2713                 return 0;
2714         }
2715
2716         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2717
2718         if (end < start) {
2719                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2720                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2721                         dmi_get_system_info(DMI_BIOS_VENDOR),
2722                         dmi_get_system_info(DMI_BIOS_VERSION),
2723                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2724                 return -EIO;
2725         }
2726
2727         if (end >> agaw_to_width(domain->agaw)) {
2728                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2729                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2730                      agaw_to_width(domain->agaw),
2731                      dmi_get_system_info(DMI_BIOS_VENDOR),
2732                      dmi_get_system_info(DMI_BIOS_VERSION),
2733                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2734                 return -EIO;
2735         }
2736
2737         return iommu_domain_identity_map(domain, start, end);
2738 }
2739
2740 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2741
2742 static int __init si_domain_init(int hw)
2743 {
2744         struct dmar_rmrr_unit *rmrr;
2745         struct device *dev;
2746         int i, nid, ret;
2747
2748         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2749         if (!si_domain)
2750                 return -EFAULT;
2751
2752         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2753                 domain_exit(si_domain);
2754                 si_domain = NULL;
2755                 return -EFAULT;
2756         }
2757
2758         if (hw)
2759                 return 0;
2760
2761         for_each_online_node(nid) {
2762                 unsigned long start_pfn, end_pfn;
2763                 int i;
2764
2765                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2766                         ret = iommu_domain_identity_map(si_domain,
2767                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2768                         if (ret)
2769                                 return ret;
2770                 }
2771         }
2772
2773         /*
2774          * Identity map the RMRRs so that devices with RMRRs can also use
2775          * the si_domain.
2776          */
2777         for_each_rmrr_units(rmrr) {
2778                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2779                                           i, dev) {
2780                         unsigned long long start = rmrr->base_address;
2781                         unsigned long long end = rmrr->end_address;
2782
2783                         if (WARN_ON(end < start ||
2784                                     end >> agaw_to_width(si_domain->agaw)))
2785                                 continue;
2786
2787                         ret = iommu_domain_identity_map(si_domain, start, end);
2788                         if (ret)
2789                                 return ret;
2790                 }
2791         }
2792
2793         return 0;
2794 }
2795
2796 static int identity_mapping(struct device *dev)
2797 {
2798         struct device_domain_info *info;
2799
2800         info = dev->archdata.iommu;
2801         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2802                 return (info->domain == si_domain);
2803
2804         return 0;
2805 }
2806
2807 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2808 {
2809         struct dmar_domain *ndomain;
2810         struct intel_iommu *iommu;
2811         u8 bus, devfn;
2812
2813         iommu = device_to_iommu(dev, &bus, &devfn);
2814         if (!iommu)
2815                 return -ENODEV;
2816
2817         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2818         if (ndomain != domain)
2819                 return -EBUSY;
2820
2821         return 0;
2822 }
2823
2824 static bool device_has_rmrr(struct device *dev)
2825 {
2826         struct dmar_rmrr_unit *rmrr;
2827         struct device *tmp;
2828         int i;
2829
2830         rcu_read_lock();
2831         for_each_rmrr_units(rmrr) {
2832                 /*
2833                  * Return TRUE if this RMRR contains the device that
2834                  * is passed in.
2835                  */
2836                 for_each_active_dev_scope(rmrr->devices,
2837                                           rmrr->devices_cnt, i, tmp)
2838                         if (tmp == dev ||
2839                             is_downstream_to_pci_bridge(dev, tmp)) {
2840                                 rcu_read_unlock();
2841                                 return true;
2842                         }
2843         }
2844         rcu_read_unlock();
2845         return false;
2846 }
2847
2848 /**
2849  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2850  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2851  * @dev: device handle
2852  *
2853  * We assume that PCI USB devices with RMRRs have them largely
2854  * for historical reasons and that the RMRR space is not actively used post
2855  * boot.  This exclusion may change if vendors begin to abuse it.
2856  *
2857  * The same exception is made for graphics devices, with the requirement that
2858  * any use of the RMRR regions will be torn down before assigning the device
2859  * to a guest.
2860  *
2861  * Return: true if the RMRR is relaxable, false otherwise
2862  */
2863 static bool device_rmrr_is_relaxable(struct device *dev)
2864 {
2865         struct pci_dev *pdev;
2866
2867         if (!dev_is_pci(dev))
2868                 return false;
2869
2870         pdev = to_pci_dev(dev);
2871         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2872                 return true;
2873         else
2874                 return false;
2875 }
2876
2877 /*
2878  * There are a couple cases where we need to restrict the functionality of
2879  * devices associated with RMRRs.  The first is when evaluating a device for
2880  * identity mapping because problems exist when devices are moved in and out
2881  * of domains and their respective RMRR information is lost.  This means that
2882  * a device with associated RMRRs will never be in a "passthrough" domain.
2883  * The second is use of the device through the IOMMU API.  This interface
2884  * expects to have full control of the IOVA space for the device.  We cannot
2885  * satisfy both the requirement that RMRR access is maintained and have an
2886  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2887  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2888  * We therefore prevent devices associated with an RMRR from participating in
2889  * the IOMMU API, which eliminates them from device assignment.
2890  *
2891  * In both cases, devices which have relaxable RMRRs are not concerned by this
2892  * restriction. See device_rmrr_is_relaxable comment.
2893  */
2894 static bool device_is_rmrr_locked(struct device *dev)
2895 {
2896         if (!device_has_rmrr(dev))
2897                 return false;
2898
2899         if (device_rmrr_is_relaxable(dev))
2900                 return false;
2901
2902         return true;
2903 }
2904
2905 /*
2906  * Return the required default domain type for a specific device.
2907  *
2908  * @dev: the device in question
2909  *
2910  *
2911  * Returns:
2912  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2913  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2914  *  - 0: both identity and dynamic domains work for this device
2915  */
2916 static int device_def_domain_type(struct device *dev)
2917 {
2918         if (dev_is_pci(dev)) {
2919                 struct pci_dev *pdev = to_pci_dev(dev);
2920
2921                 /*
2922                  * Prevent any device marked as untrusted from getting
2923                  * placed into the static identity mapping domain.
2924                  */
2925                 if (pdev->untrusted)
2926                         return IOMMU_DOMAIN_DMA;
2927
2928                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2929                         return IOMMU_DOMAIN_IDENTITY;
2930
2931                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2932                         return IOMMU_DOMAIN_IDENTITY;
2933
2934                 /*
2935                  * We want to start off with all devices in the 1:1 domain, and
2936                  * take them out later if we find they can't access all of memory.
2937                  *
2938                  * However, we can't do this for PCI devices behind bridges,
2939                  * because all PCI devices behind the same bridge will end up
2940                  * with the same source-id on their transactions.
2941                  *
2942                  * Practically speaking, we can't change things around for these
2943                  * devices at run-time, because we can't be sure there'll be no
2944                  * DMA transactions in flight for any of their siblings.
2945                  *
2946                  * So PCI devices (unless they're on the root bus) as well as
2947                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2948                  * the 1:1 domain, just in _case_ one of their siblings turns out
2949                  * not to be able to map all of memory.
2950                  */
2951                 if (!pci_is_pcie(pdev)) {
2952                         if (!pci_is_root_bus(pdev->bus))
2953                                 return IOMMU_DOMAIN_DMA;
2954                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2955                                 return IOMMU_DOMAIN_DMA;
2956                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2957                         return IOMMU_DOMAIN_DMA;
2958         }
2959
2960         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2961                         IOMMU_DOMAIN_IDENTITY : 0;
2962 }
2963
2964 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2965 {
2966         /*
2967          * Start from a sane IOMMU hardware state.
2968          * If queued invalidation was already initialized by us
2969          * (for example, while enabling interrupt remapping) then
2970          * things are already rolling from a sane state.
2971          */
2972         if (!iommu->qi) {
2973                 /*
2974                  * Clear any previous faults.
2975                  */
2976                 dmar_fault(-1, iommu);
2977                 /*
2978                  * Disable queued invalidation if supported and already enabled
2979                  * before OS handover.
2980                  */
2981                 dmar_disable_qi(iommu);
2982         }
2983
2984         if (dmar_enable_qi(iommu)) {
2985                 /*
2986                  * Queued Invalidate not enabled, use Register Based Invalidate
2987                  * Queued invalidation could not be enabled; fall back to register-based invalidation.
2988                 iommu->flush.flush_context = __iommu_flush_context;
2989                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2990                 pr_info("%s: Using Register based invalidation\n",
2991                         iommu->name);
2992         } else {
2993                 iommu->flush.flush_context = qi_flush_context;
2994                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2995                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2996         }
2997 }
2998
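/*
 * Copy the context entries of one bus from the previous kernel's tables.
 * @old_re is the old root entry for this bus; present context entries are
 * copied into freshly allocated table(s) recorded in @tbl. With the
 * extended (ECS) format each bus uses two context tables, one per half of
 * the devfn space, hence the "* 2" index arithmetic below. Domain IDs found
 * in copied entries are reserved in iommu->domain_ids so they are not reused.
 */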
2999 static int copy_context_table(struct intel_iommu *iommu,
3000                               struct root_entry *old_re,
3001                               struct context_entry **tbl,
3002                               int bus, bool ext)
3003 {
3004         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3005         struct context_entry *new_ce = NULL, ce;
3006         struct context_entry *old_ce = NULL;
3007         struct root_entry re;
3008         phys_addr_t old_ce_phys;
3009
3010         tbl_idx = ext ? bus * 2 : bus;
3011         memcpy(&re, old_re, sizeof(re));
3012
3013         for (devfn = 0; devfn < 256; devfn++) {
3014                 /* First calculate the correct index */
3015                 idx = (ext ? devfn * 2 : devfn) % 256;
3016
3017                 if (idx == 0) {
3018                         /* First save what we may have and clean up */
3019                         if (new_ce) {
3020                                 tbl[tbl_idx] = new_ce;
3021                                 __iommu_flush_cache(iommu, new_ce,
3022                                                     VTD_PAGE_SIZE);
3023                                 pos = 1;
3024                         }
3025
3026                         if (old_ce)
3027                                 memunmap(old_ce);
3028
3029                         ret = 0;
3030                         if (devfn < 0x80)
3031                                 old_ce_phys = root_entry_lctp(&re);
3032                         else
3033                                 old_ce_phys = root_entry_uctp(&re);
3034
3035                         if (!old_ce_phys) {
3036                                 if (ext && devfn == 0) {
3037                                         /* No LCTP, try UCTP */
3038                                         devfn = 0x7f;
3039                                         continue;
3040                                 } else {
3041                                         goto out;
3042                                 }
3043                         }
3044
3045                         ret = -ENOMEM;
3046                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3047                                         MEMREMAP_WB);
3048                         if (!old_ce)
3049                                 goto out;
3050
3051                         new_ce = alloc_pgtable_page(iommu->node);
3052                         if (!new_ce)
3053                                 goto out_unmap;
3054
3055                         ret = 0;
3056                 }
3057
3058                 /* Now copy the context entry */
3059                 memcpy(&ce, old_ce + idx, sizeof(ce));
3060
3061                 if (!__context_present(&ce))
3062                         continue;
3063
3064                 did = context_domain_id(&ce);
3065                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3066                         set_bit(did, iommu->domain_ids);
3067
3068                 /*
3069                  * We need a marker for copied context entries. This
3070                  * marker needs to work for the old format as well as
3071                  * for extended context entries.
3072                  *
3073                  * Bit 67 of the context entry is used. In the old
3074                  * format this bit is available to software, in the
3075                  * extended format it is the PGE bit, but PGE is ignored
3076                  * by HW if PASIDs are disabled (and thus still
3077                  * available).
3078                  *
3079                  * So disable PASIDs first and then mark the entry
3080                  * copied. This means that we don't copy PASID
3081                  * translations from the old kernel, but this is fine as
3082                  * faults there are not fatal.
3083                  */
3084                 context_clear_pasid_enable(&ce);
3085                 context_set_copied(&ce);
3086
3087                 new_ce[idx] = ce;
3088         }
3089
3090         tbl[tbl_idx + pos] = new_ce;
3091
3092         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3093
3094 out_unmap:
3095         memunmap(old_ce);
3096
3097 out:
3098         return ret;
3099 }
3100
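/*
 * In a kdump kernel the IOMMU may still be running with the crashed
 * kernel's translation tables. Copy the old root/context tables into this
 * kernel so that in-flight DMA keeps hitting valid translations while the
 * crash dump proceeds, instead of faulting as soon as we install an empty
 * root table.
 */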
3101 static int copy_translation_tables(struct intel_iommu *iommu)
3102 {
3103         struct context_entry **ctxt_tbls;
3104         struct root_entry *old_rt;
3105         phys_addr_t old_rt_phys;
3106         int ctxt_table_entries;
3107         unsigned long flags;
3108         u64 rtaddr_reg;
3109         int bus, ret;
3110         bool new_ext, ext;
3111
3112         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3113         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3114         new_ext    = !!ecap_ecs(iommu->ecap);
3115
3116         /*
3117          * The RTT bit can only be changed when translation is disabled,
3118          * but disabling translation would open a window for data
3119          * corruption. So bail out and don't copy anything if we would
3120          * have to change the bit.
3121          */
3122         if (new_ext != ext)
3123                 return -EINVAL;
3124
3125         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3126         if (!old_rt_phys)
3127                 return -EINVAL;
3128
3129         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3130         if (!old_rt)
3131                 return -ENOMEM;
3132
3133         /* This is too big for the stack - allocate it from slab */
3134         ctxt_table_entries = ext ? 512 : 256;
3135         ret = -ENOMEM;
3136         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3137         if (!ctxt_tbls)
3138                 goto out_unmap;
3139
3140         for (bus = 0; bus < 256; bus++) {
3141                 ret = copy_context_table(iommu, &old_rt[bus],
3142                                          ctxt_tbls, bus, ext);
3143                 if (ret) {
3144                         pr_err("%s: Failed to copy context table for bus %d\n",
3145                                 iommu->name, bus);
3146                         continue;
3147                 }
3148         }
3149
3150         spin_lock_irqsave(&iommu->lock, flags);
3151
3152         /* Context tables are copied, now write them to the root_entry table */
3153         for (bus = 0; bus < 256; bus++) {
3154                 int idx = ext ? bus * 2 : bus;
3155                 u64 val;
3156
3157                 if (ctxt_tbls[idx]) {
3158                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3159                         iommu->root_entry[bus].lo = val;
3160                 }
3161
3162                 if (!ext || !ctxt_tbls[idx + 1])
3163                         continue;
3164
3165                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3166                 iommu->root_entry[bus].hi = val;
3167         }
3168
3169         spin_unlock_irqrestore(&iommu->lock, flags);
3170
3171         kfree(ctxt_tbls);
3172
3173         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3174
3175         ret = 0;
3176
3177 out_unmap:
3178         memunmap(old_rt);
3179
3180         return ret;
3181 }
3182
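/*
 * Boot-time initialization of all DMAR units: allocate the global g_iommus
 * array, set up queued invalidation, domain-id bitmaps and root entries for
 * each IOMMU, copy translation tables from a pre-enabled (kdump) kernel
 * where needed, initialize the static identity domain and finally enable
 * fault reporting interrupts.
 */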
3183 static int __init init_dmars(void)
3184 {
3185         struct dmar_drhd_unit *drhd;
3186         struct intel_iommu *iommu;
3187         int ret;
3188
3189         /*
3190          * for each drhd
3191          *    allocate root
3192          *    initialize and program root entry to not present
3193          * endfor
3194          */
3195         for_each_drhd_unit(drhd) {
3196                 /*
3197                  * No lock needed: this counter is only incremented in the
3198                  * single-threaded kernel __init code path; all other
3199                  * accesses are read-only.
3200                  */
3201                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3202                         g_num_of_iommus++;
3203                         continue;
3204                 }
3205                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3206         }
3207
3208         /* Preallocate enough resources for IOMMU hot-addition */
3209         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3210                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3211
3212         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3213                         GFP_KERNEL);
3214         if (!g_iommus) {
3215                 pr_err("Allocating global iommu array failed\n");
3216                 ret = -ENOMEM;
3217                 goto error;
3218         }
3219
3220         for_each_iommu(iommu, drhd) {
3221                 if (drhd->ignored) {
3222                         iommu_disable_translation(iommu);
3223                         continue;
3224                 }
3225
3226                 /*
3227                  * Find the max PASID size of all IOMMUs in the system.
3228                  * We need to ensure the system PASID table is no bigger
3229                  * than the smallest size any IOMMU supports.
3230                  */
3231                 if (pasid_supported(iommu)) {
3232                         u32 temp = 2 << ecap_pss(iommu->ecap);
3233
3234                         intel_pasid_max_id = min_t(u32, temp,
3235                                                    intel_pasid_max_id);
3236                 }
3237
3238                 g_iommus[iommu->seq_id] = iommu;
3239
3240                 intel_iommu_init_qi(iommu);
3241
3242                 ret = iommu_init_domains(iommu);
3243                 if (ret)
3244                         goto free_iommu;
3245
3246                 init_translation_status(iommu);
3247
3248                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3249                         iommu_disable_translation(iommu);
3250                         clear_translation_pre_enabled(iommu);
3251                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3252                                 iommu->name);
3253                 }
3254
3255                 /*
3256                  * TBD: we could share the same root & context tables
3257                  * among all IOMMUs; this needs to be split out later.
3259                  */
3260                 ret = iommu_alloc_root_entry(iommu);
3261                 if (ret)
3262                         goto free_iommu;
3263
3264                 if (translation_pre_enabled(iommu)) {
3265                         pr_info("Translation already enabled - trying to copy translation structures\n");
3266
3267                         ret = copy_translation_tables(iommu);
3268                         if (ret) {
3269                                 /*
3270                                  * We found the IOMMU with translation
3271                                  * enabled - but failed to copy over the
3272                                  * old root-entry table. Try to proceed
3273                                  * by disabling translation now and
3274                                  * allocating a clean root-entry table.
3275                                  * This might cause DMAR faults, but
3276                                  * probably the dump will still succeed.
3277                                  */
3278                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3279                                        iommu->name);
3280                                 iommu_disable_translation(iommu);
3281                                 clear_translation_pre_enabled(iommu);
3282                         } else {
3283                                 pr_info("Copied translation tables from previous kernel for %s\n",
3284                                         iommu->name);
3285                         }
3286                 }
3287
3288                 if (!ecap_pass_through(iommu->ecap))
3289                         hw_pass_through = 0;
3290
3291                 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
3292                         pr_info("Disable batched IOTLB flush due to virtualization\n");
3293                         intel_iommu_strict = 1;
3294                 }
3295
3296 #ifdef CONFIG_INTEL_IOMMU_SVM
3297                 if (pasid_supported(iommu))
3298                         intel_svm_init(iommu);
3299 #endif
3300         }
3301
3302         /*
3303          * Now that qi is enabled on all iommus, set the root entry and flush
3304          * caches. This is required on some Intel X58 chipsets, otherwise the
3305          * flush_context function will loop forever and the boot hangs.
3306          */
3307         for_each_active_iommu(iommu, drhd) {
3308                 iommu_flush_write_buffer(iommu);
3309                 iommu_set_root_entry(iommu);
3310                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3311                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3312         }
3313
3314         if (iommu_default_passthrough())
3315                 iommu_identity_mapping |= IDENTMAP_ALL;
3316
3317 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3318         dmar_map_gfx = 0;
3319 #endif
3320
3321         if (!dmar_map_gfx)
3322                 iommu_identity_mapping |= IDENTMAP_GFX;
3323
3324         check_tylersburg_isoch();
3325
3326         ret = si_domain_init(hw_pass_through);
3327         if (ret)
3328                 goto free_iommu;
3329
3330         /*
3331          * for each drhd
3332          *   enable fault log
3333          *   global invalidate context cache
3334          *   global invalidate iotlb
3335          *   enable translation
3336          */
3337         for_each_iommu(iommu, drhd) {
3338                 if (drhd->ignored) {
3339                         /*
3340                          * we always have to disable PMRs or DMA may fail on
3341                          * this device
3342                          */
3343                         if (force_on)
3344                                 iommu_disable_protect_mem_regions(iommu);
3345                         continue;
3346                 }
3347
3348                 iommu_flush_write_buffer(iommu);
3349
3350 #ifdef CONFIG_INTEL_IOMMU_SVM
3351                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3352                         /*
3353                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3354                          * could cause a lock race, so drop the lock around it.
3355                          */
3356                         up_write(&dmar_global_lock);
3357                         ret = intel_svm_enable_prq(iommu);
3358                         down_write(&dmar_global_lock);
3359                         if (ret)
3360                                 goto free_iommu;
3361                 }
3362 #endif
3363                 ret = dmar_set_interrupt(iommu);
3364                 if (ret)
3365                         goto free_iommu;
3366         }
3367
3368         return 0;
3369
3370 free_iommu:
3371         for_each_active_iommu(iommu, drhd) {
3372                 disable_dmar_iommu(iommu);
3373                 free_dmar_iommu(iommu);
3374         }
3375         if (si_domain) {
3376                 domain_exit(si_domain);
3377                 si_domain = NULL;
3378         }
3379
3380         kfree(g_iommus);
3381
3382 error:
3383         return ret;
3384 }
3385
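/*
 * Allocate an IOVA range of @nrpages MM pages for @dev, limited by both the
 * device's DMA mask and the domain's address width. Unless forcedac is set,
 * allocation is first attempted below 4GiB and only then from the full range.
 */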
3386 /* This takes a number of _MM_ pages, not VTD pages */
3387 static unsigned long intel_alloc_iova(struct device *dev,
3388                                      struct dmar_domain *domain,
3389                                      unsigned long nrpages, uint64_t dma_mask)
3390 {
3391         unsigned long iova_pfn;
3392
3393         /* Restrict dma_mask to the width that the iommu can handle */
3394         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3395         /* Ensure we reserve the whole size-aligned region */
3396         nrpages = __roundup_pow_of_two(nrpages);
3397
3398         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3399                 /*
3400                  * First try to allocate an IO virtual address below
3401                  * DMA_BIT_MASK(32); if that fails, fall back to allocating
3402                  * from the higher range.
3403                  */
3404                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3405                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3406                 if (iova_pfn)
3407                         return iova_pfn;
3408         }
3409         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3410                                    IOVA_PFN(dma_mask), true);
3411         if (unlikely(!iova_pfn)) {
3412                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3413                              nrpages);
3414                 return 0;
3415         }
3416
3417         return iova_pfn;
3418 }
3419
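/*
 * Allocate a private DMA domain for a device that currently has none, e.g.
 * after it has been dropped from the static identity map. Any RMRRs that
 * target the device are identity-mapped into the new domain so reserved
 * memory regions keep working.
 */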
3420 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3421 {
3422         struct dmar_domain *domain, *tmp;
3423         struct dmar_rmrr_unit *rmrr;
3424         struct device *i_dev;
3425         int i, ret;
3426
3427         /* The device should not already be attached to any domain. */
3428         domain = find_domain(dev);
3429         if (domain)
3430                 return NULL;
3431
3432         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3433         if (!domain)
3434                 goto out;
3435
3436         /* We have a new domain - setup possible RMRRs for the device */
3437         rcu_read_lock();
3438         for_each_rmrr_units(rmrr) {
3439                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3440                                           i, i_dev) {
3441                         if (i_dev != dev)
3442                                 continue;
3443
3444                         ret = domain_prepare_identity_map(dev, domain,
3445                                                           rmrr->base_address,
3446                                                           rmrr->end_address);
3447                         if (ret)
3448                                 dev_err(dev, "Mapping reserved region failed\n");
3449                 }
3450         }
3451         rcu_read_unlock();
3452
3453         tmp = set_domain_for_dev(dev, domain);
3454         if (!tmp || domain != tmp) {
3455                 domain_exit(domain);
3456                 domain = tmp;
3457         }
3458
3459 out:
3460         if (!domain)
3461                 dev_err(dev, "Allocating domain failed\n");
3462         else
3463                 domain->domain.type = IOMMU_DOMAIN_DMA;
3464
3465         return domain;
3466 }
3467
3468 /* Check if the dev needs to go through the non-identity map and unmap process. */
3469 static bool iommu_need_mapping(struct device *dev)
3470 {
3471         int ret;
3472
3473         if (iommu_dummy(dev))
3474                 return false;
3475
3476         ret = identity_mapping(dev);
3477         if (ret) {
3478                 u64 dma_mask = *dev->dma_mask;
3479
3480                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3481                         dma_mask = dev->coherent_dma_mask;
3482
3483                 if (dma_mask >= dma_direct_get_required_mask(dev))
3484                         return false;
3485
3486                 /*
3487                  * The 32-bit DMA device is removed from si_domain and
3488                  * falls back to a non-identity mapping.
3489                  */
3490                 dmar_remove_one_dev_info(dev);
3491                 ret = iommu_request_dma_domain_for_dev(dev);
3492                 if (ret) {
3493                         struct iommu_domain *domain;
3494                         struct dmar_domain *dmar_domain;
3495
3496                         domain = iommu_get_domain_for_dev(dev);
3497                         if (domain) {
3498                                 dmar_domain = to_dmar_domain(domain);
3499                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3500                         }
3501                         dmar_remove_one_dev_info(dev);
3502                         get_private_domain_for_dev(dev);
3503                 }
3504
3505                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3506         }
3507
3508         return true;
3509 }
3510
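/*
 * Core streaming-DMA map path: allocate an IOVA range covering @size bytes
 * at @paddr, install the page-table entries with permissions derived from
 * @dir, and return the bus address (IOVA plus the sub-page offset of
 * @paddr), or DMA_MAPPING_ERROR on failure.
 */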
3511 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3512                                      size_t size, int dir, u64 dma_mask)
3513 {
3514         struct dmar_domain *domain;
3515         phys_addr_t start_paddr;
3516         unsigned long iova_pfn;
3517         int prot = 0;
3518         int ret;
3519         struct intel_iommu *iommu;
3520         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3521
3522         BUG_ON(dir == DMA_NONE);
3523
3524         domain = find_domain(dev);
3525         if (!domain)
3526                 return DMA_MAPPING_ERROR;
3527
3528         iommu = domain_get_iommu(domain);
3529         size = aligned_nrpages(paddr, size);
3530
3531         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3532         if (!iova_pfn)
3533                 goto error;
3534
3535         /*
3536          * Check if DMAR supports zero-length reads on write-only
3537          * mappings.
3538          */
3539         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3540                         !cap_zlr(iommu->cap))
3541                 prot |= DMA_PTE_READ;
3542         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3543                 prot |= DMA_PTE_WRITE;
3544         /*
3545          * [paddr, paddr + size) might cover only part of a page, but we must
3546          * map the whole page.  Note: if two parts of one page are mapped
3547          * separately, we may end up with two IOVAs mapping the same host
3548          * paddr, but this is not a big problem.
3549          */
3550         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3551                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3552         if (ret)
3553                 goto error;
3554
3555         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3556         start_paddr += paddr & ~PAGE_MASK;
3557
3558         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3559
3560         return start_paddr;
3561
3562 error:
3563         if (iova_pfn)
3564                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3565         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3566                 size, (unsigned long long)paddr, dir);
3567         return DMA_MAPPING_ERROR;
3568 }
3569
3570 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3571                                  unsigned long offset, size_t size,
3572                                  enum dma_data_direction dir,
3573                                  unsigned long attrs)
3574 {
3575         if (iommu_need_mapping(dev))
3576                 return __intel_map_single(dev, page_to_phys(page) + offset,
3577                                 size, dir, *dev->dma_mask);
3578         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3579 }
3580
3581 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3582                                      size_t size, enum dma_data_direction dir,
3583                                      unsigned long attrs)
3584 {
3585         if (iommu_need_mapping(dev))
3586                 return __intel_map_single(dev, phys_addr, size, dir,
3587                                 *dev->dma_mask);
3588         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3589 }
3590
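/*
 * Tear down a streaming mapping: collect the page-table pages backing the
 * range, then either flush the IOTLB and free the IOVA immediately (strict
 * mode, untrusted devices, or no flush queue available) or defer both via
 * the IOVA flush queue.
 */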
3591 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3592 {
3593         struct dmar_domain *domain;
3594         unsigned long start_pfn, last_pfn;
3595         unsigned long nrpages;
3596         unsigned long iova_pfn;
3597         struct intel_iommu *iommu;
3598         struct page *freelist;
3599         struct pci_dev *pdev = NULL;
3600
3601         domain = find_domain(dev);
3602         BUG_ON(!domain);
3603
3604         iommu = domain_get_iommu(domain);
3605
3606         iova_pfn = IOVA_PFN(dev_addr);
3607
3608         nrpages = aligned_nrpages(dev_addr, size);
3609         start_pfn = mm_to_dma_pfn(iova_pfn);
3610         last_pfn = start_pfn + nrpages - 1;
3611
3612         if (dev_is_pci(dev))
3613                 pdev = to_pci_dev(dev);
3614
3615         freelist = domain_unmap(domain, start_pfn, last_pfn);
3616         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3617                         !has_iova_flush_queue(&domain->iovad)) {
3618                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3619                                       nrpages, !freelist, 0);
3620                 /* free iova */
3621                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3622                 dma_free_pagelist(freelist);
3623         } else {
3624                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3625                            (unsigned long)freelist);
3626                 /*
3627                  * Queue up the release of the unmapped range to save the
3628                  * roughly 1/6th of CPU time used up by the IOTLB flush.
3629                  */
3630         }
3631
3632         trace_unmap_single(dev, dev_addr, size);
3633 }
3634
3635 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3636                              size_t size, enum dma_data_direction dir,
3637                              unsigned long attrs)
3638 {
3639         if (iommu_need_mapping(dev))
3640                 intel_unmap(dev, dev_addr, size);
3641         else
3642                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3643 }
3644
3645 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3646                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3647 {
3648         if (iommu_need_mapping(dev))
3649                 intel_unmap(dev, dev_addr, size);
3650 }
3651
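/*
 * Coherent allocations: grab zeroed pages (preferring the contiguous/CMA
 * allocator when blocking is allowed) and map them bidirectionally under
 * the device's coherent DMA mask.
 */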
3652 static void *intel_alloc_coherent(struct device *dev, size_t size,
3653                                   dma_addr_t *dma_handle, gfp_t flags,
3654                                   unsigned long attrs)
3655 {
3656         struct page *page = NULL;
3657         int order;
3658
3659         if (!iommu_need_mapping(dev))
3660                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3661
3662         size = PAGE_ALIGN(size);
3663         order = get_order(size);
3664
3665         if (gfpflags_allow_blocking(flags)) {
3666                 unsigned int count = size >> PAGE_SHIFT;
3667
3668                 page = dma_alloc_from_contiguous(dev, count, order,
3669                                                  flags & __GFP_NOWARN);
3670         }
3671
3672         if (!page)
3673                 page = alloc_pages(flags, order);
3674         if (!page)
3675                 return NULL;
3676         memset(page_address(page), 0, size);
3677
3678         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3679                                          DMA_BIDIRECTIONAL,
3680                                          dev->coherent_dma_mask);
3681         if (*dma_handle != DMA_MAPPING_ERROR)
3682                 return page_address(page);
3683         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3684                 __free_pages(page, order);
3685
3686         return NULL;
3687 }
3688
3689 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3690                                 dma_addr_t dma_handle, unsigned long attrs)
3691 {
3692         int order;
3693         struct page *page = virt_to_page(vaddr);
3694
3695         if (!iommu_need_mapping(dev))
3696                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3697
3698         size = PAGE_ALIGN(size);
3699         order = get_order(size);
3700
3701         intel_unmap(dev, dma_handle, size);
3702         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3703                 __free_pages(page, order);
3704 }
3705
3706 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3707                            int nelems, enum dma_data_direction dir,
3708                            unsigned long attrs)
3709 {
3710         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3711         unsigned long nrpages = 0;
3712         struct scatterlist *sg;
3713         int i;
3714
3715         if (!iommu_need_mapping(dev))
3716                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3717
3718         for_each_sg(sglist, sg, nelems, i) {
3719                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3720         }
3721
3722         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3723
3724         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3725 }
3726
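/*
 * Map a scatterlist into one contiguous IOVA range: the page counts of all
 * entries are summed, a single IOVA allocation covers them, and
 * domain_sg_mapping() fills in the page tables entry by entry.
 */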
3727 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3728                         enum dma_data_direction dir, unsigned long attrs)
3729 {
3730         int i;
3731         struct dmar_domain *domain;
3732         size_t size = 0;
3733         int prot = 0;
3734         unsigned long iova_pfn;
3735         int ret;
3736         struct scatterlist *sg;
3737         unsigned long start_vpfn;
3738         struct intel_iommu *iommu;
3739
3740         BUG_ON(dir == DMA_NONE);
3741         if (!iommu_need_mapping(dev))
3742                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3743
3744         domain = find_domain(dev);
3745         if (!domain)
3746                 return 0;
3747
3748         iommu = domain_get_iommu(domain);
3749
3750         for_each_sg(sglist, sg, nelems, i)
3751                 size += aligned_nrpages(sg->offset, sg->length);
3752
3753         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3754                                 *dev->dma_mask);
3755         if (!iova_pfn) {
3756                 sglist->dma_length = 0;
3757                 return 0;
3758         }
3759
3760         /*
3761          * Check if DMAR supports zero-length reads on write-only
3762          * mappings.
3763          */
3764         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3765                         !cap_zlr(iommu->cap))
3766                 prot |= DMA_PTE_READ;
3767         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3768                 prot |= DMA_PTE_WRITE;
3769
3770         start_vpfn = mm_to_dma_pfn(iova_pfn);
3771
3772         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3773         if (unlikely(ret)) {
3774                 dma_pte_free_pagetable(domain, start_vpfn,
3775                                        start_vpfn + size - 1,
3776                                        agaw_to_level(domain->agaw) + 1);
3777                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3778                 return 0;
3779         }
3780
3781         trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3782                      sg_phys(sglist), size << VTD_PAGE_SHIFT);
3783
3784         return nelems;
3785 }
3786
3787 static u64 intel_get_required_mask(struct device *dev)
3788 {
3789         if (!iommu_need_mapping(dev))
3790                 return dma_direct_get_required_mask(dev);
3791         return DMA_BIT_MASK(32);
3792 }
3793
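/*
 * DMA API operations for devices translated by the IOMMU. Identity-mapped
 * (pass-through) devices fall back to dma-direct from inside each callback
 * via iommu_need_mapping().
 */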
3794 static const struct dma_map_ops intel_dma_ops = {
3795         .alloc = intel_alloc_coherent,
3796         .free = intel_free_coherent,
3797         .map_sg = intel_map_sg,
3798         .unmap_sg = intel_unmap_sg,
3799         .map_page = intel_map_page,
3800         .unmap_page = intel_unmap_page,
3801         .map_resource = intel_map_resource,
3802         .unmap_resource = intel_unmap_resource,
3803         .dma_supported = dma_direct_supported,
3804         .mmap = dma_common_mmap,
3805         .get_sgtable = dma_common_get_sgtable,
3806         .get_required_mask = intel_get_required_mask,
3807 };
3808
3809 static void
3810 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3811                    enum dma_data_direction dir, enum dma_sync_target target)
3812 {
3813         struct dmar_domain *domain;
3814         phys_addr_t tlb_addr;
3815
3816         domain = find_domain(dev);
3817         if (WARN_ON(!domain))
3818                 return;
3819
3820         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3821         if (is_swiotlb_buffer(tlb_addr))
3822                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3823 }
3824
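/*
 * Map @paddr for an untrusted device. If the buffer start or size is not
 * VTD_PAGE_SIZE aligned, the data is bounced through swiotlb so the device
 * can never DMA past the buffer it was handed; the IOMMU mapping then
 * points at the aligned bounce slot rather than the original buffer.
 */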
3825 static dma_addr_t
3826 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3827                   enum dma_data_direction dir, unsigned long attrs,
3828                   u64 dma_mask)
3829 {
3830         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3831         struct dmar_domain *domain;
3832         struct intel_iommu *iommu;
3833         unsigned long iova_pfn;
3834         unsigned long nrpages;
3835         phys_addr_t tlb_addr;
3836         int prot = 0;
3837         int ret;
3838
3839         domain = find_domain(dev);
3840         if (WARN_ON(dir == DMA_NONE || !domain))
3841                 return DMA_MAPPING_ERROR;
3842
3843         iommu = domain_get_iommu(domain);
3844         if (WARN_ON(!iommu))
3845                 return DMA_MAPPING_ERROR;
3846
3847         nrpages = aligned_nrpages(0, size);
3848         iova_pfn = intel_alloc_iova(dev, domain,
3849                                     dma_to_mm_pfn(nrpages), dma_mask);
3850         if (!iova_pfn)
3851                 return DMA_MAPPING_ERROR;
3852
3853         /*
3854          * Check if DMAR supports zero-length reads on write-only
3855          * mappings.
3856          */
3857         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3858                         !cap_zlr(iommu->cap))
3859                 prot |= DMA_PTE_READ;
3860         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3861                 prot |= DMA_PTE_WRITE;
3862
3863         /*
3864          * If both the physical buffer start address and size are
3865          * page aligned, we don't need to use a bounce page.
3866          */
3867         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3868                 tlb_addr = swiotlb_tbl_map_single(dev,
3869                                 __phys_to_dma(dev, io_tlb_start),
3870                                 paddr, size, aligned_size, dir, attrs);
3871                 if (tlb_addr == DMA_MAPPING_ERROR) {
3872                         goto swiotlb_error;
3873                 } else {
3874                         /* Cleanup the padding area. */
3875                         void *padding_start = phys_to_virt(tlb_addr);
3876                         size_t padding_size = aligned_size;
3877
3878                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3879                             (dir == DMA_TO_DEVICE ||
3880                              dir == DMA_BIDIRECTIONAL)) {
3881                                 padding_start += size;
3882                                 padding_size -= size;
3883                         }
3884
3885                         memset(padding_start, 0, padding_size);
3886                 }
3887         } else {
3888                 tlb_addr = paddr;
3889         }
3890
3891         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3892                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3893         if (ret)
3894                 goto mapping_error;
3895
3896         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3897
3898         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3899
3900 mapping_error:
3901         if (is_swiotlb_buffer(tlb_addr))
3902                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3903                                          aligned_size, dir, attrs);
3904 swiotlb_error:
3905         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3906         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3907                 size, (unsigned long long)paddr, dir);
3908
3909         return DMA_MAPPING_ERROR;
3910 }
3911
3912 static void
3913 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3914                     enum dma_data_direction dir, unsigned long attrs)
3915 {
3916         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3917         struct dmar_domain *domain;
3918         phys_addr_t tlb_addr;
3919
3920         domain = find_domain(dev);
3921         if (WARN_ON(!domain))
3922                 return;
3923
3924         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3925         if (WARN_ON(!tlb_addr))
3926                 return;
3927
3928         intel_unmap(dev, dev_addr, size);
3929         if (is_swiotlb_buffer(tlb_addr))
3930                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3931                                          aligned_size, dir, attrs);
3932
3933         trace_bounce_unmap_single(dev, dev_addr, size);
3934 }
3935
3936 static dma_addr_t
3937 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3938                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3939 {
3940         return bounce_map_single(dev, page_to_phys(page) + offset,
3941                                  size, dir, attrs, *dev->dma_mask);
3942 }
3943
3944 static dma_addr_t
3945 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3946                     enum dma_data_direction dir, unsigned long attrs)
3947 {
3948         return bounce_map_single(dev, phys_addr, size,
3949                                  dir, attrs, *dev->dma_mask);
3950 }
3951
3952 static void
3953 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3954                   enum dma_data_direction dir, unsigned long attrs)
3955 {
3956         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3957 }
3958
3959 static void
3960 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3961                       enum dma_data_direction dir, unsigned long attrs)
3962 {
3963         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3964 }
3965
3966 static void
3967 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3968                 enum dma_data_direction dir, unsigned long attrs)
3969 {
3970         struct scatterlist *sg;
3971         int i;
3972
3973         for_each_sg(sglist, sg, nelems, i)
3974                 bounce_unmap_page(dev, sg->dma_address,
3975                                   sg_dma_len(sg), dir, attrs);
3976 }
3977
3978 static int
3979 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3980               enum dma_data_direction dir, unsigned long attrs)
3981 {
3982         int i;
3983         struct scatterlist *sg;
3984
3985         for_each_sg(sglist, sg, nelems, i) {
3986                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3987                                                   sg->offset, sg->length,
3988                                                   dir, attrs);
3989                 if (sg->dma_address == DMA_MAPPING_ERROR)
3990                         goto out_unmap;
3991                 sg_dma_len(sg) = sg->length;
3992         }
3993
3994         return nelems;
3995
3996 out_unmap:
3997         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3998         return 0;
3999 }
4000
4001 static void
4002 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4003                            size_t size, enum dma_data_direction dir)
4004 {
4005         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4006 }
4007
4008 static void
4009 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4010                               size_t size, enum dma_data_direction dir)
4011 {
4012         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4013 }
4014
4015 static void
4016 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4017                        int nelems, enum dma_data_direction dir)
4018 {
4019         struct scatterlist *sg;
4020         int i;
4021
4022         for_each_sg(sglist, sg, nelems, i)
4023                 bounce_sync_single(dev, sg_dma_address(sg),
4024                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4025 }
4026
4027 static void
4028 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4029                           int nelems, enum dma_data_direction dir)
4030 {
4031         struct scatterlist *sg;
4032         int i;
4033
4034         for_each_sg(sglist, sg, nelems, i)
4035                 bounce_sync_single(dev, sg_dma_address(sg),
4036                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4037 }
4038
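/*
 * DMA API operations used for untrusted devices (e.g. behind external
 * facing ports) when swiotlb is available: sub-page buffers are bounced,
 * page-aligned buffers are mapped directly.
 */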
4039 static const struct dma_map_ops bounce_dma_ops = {
4040         .alloc                  = intel_alloc_coherent,
4041         .free                   = intel_free_coherent,
4042         .map_sg                 = bounce_map_sg,
4043         .unmap_sg               = bounce_unmap_sg,
4044         .map_page               = bounce_map_page,
4045         .unmap_page             = bounce_unmap_page,
4046         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4047         .sync_single_for_device = bounce_sync_single_for_device,
4048         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4049         .sync_sg_for_device     = bounce_sync_sg_for_device,
4050         .map_resource           = bounce_map_resource,
4051         .unmap_resource         = bounce_unmap_resource,
4052         .dma_supported          = dma_direct_supported,
4053 };
4054
4055 static inline int iommu_domain_cache_init(void)
4056 {
4057         int ret = 0;
4058
4059         iommu_domain_cache = kmem_cache_create("iommu_domain",
4060                                          sizeof(struct dmar_domain),
4061                                          0,
4062                                          SLAB_HWCACHE_ALIGN,
4064                                          NULL);
4065         if (!iommu_domain_cache) {
4066                 pr_err("Couldn't create iommu_domain cache\n");
4067                 ret = -ENOMEM;
4068         }
4069
4070         return ret;
4071 }
4072
4073 static inline int iommu_devinfo_cache_init(void)
4074 {
4075         int ret = 0;
4076
4077         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4078                                          sizeof(struct device_domain_info),
4079                                          0,
4080                                          SLAB_HWCACHE_ALIGN,
4081                                          NULL);
4082         if (!iommu_devinfo_cache) {
4083                 pr_err("Couldn't create devinfo cache\n");
4084                 ret = -ENOMEM;
4085         }
4086
4087         return ret;
4088 }
4089
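/*
 * Create the dmar_domain and device_domain_info slab caches and take a
 * reference on the shared IOVA cache; undone by iommu_exit_mempool().
 */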
4090 static int __init iommu_init_mempool(void)
4091 {
4092         int ret;
4093         ret = iova_cache_get();
4094         if (ret)
4095                 return ret;
4096
4097         ret = iommu_domain_cache_init();
4098         if (ret)
4099                 goto domain_error;
4100
4101         ret = iommu_devinfo_cache_init();
4102         if (!ret)
4103                 return ret;
4104
4105         kmem_cache_destroy(iommu_domain_cache);
4106 domain_error:
4107         iova_cache_put();
4108
4109         return -ENOMEM;
4110 }
4111
4112 static void __init iommu_exit_mempool(void)
4113 {
4114         kmem_cache_destroy(iommu_devinfo_cache);
4115         kmem_cache_destroy(iommu_domain_cache);
4116         iova_cache_put();
4117 }
4118
4119 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4120 {
4121         struct dmar_drhd_unit *drhd;
4122         u32 vtbar;
4123         int rc;
4124
4125         /* We know that this device on this chipset has its own IOMMU.
4126          * If we find it under a different IOMMU, then the BIOS is lying
4127          * to us. Hope that the IOMMU for this device is actually
4128          * disabled, and it needs no translation...
4129          */
4130         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4131         if (rc) {
4132                 /* "can't" happen */
4133                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4134                 return;
4135         }
4136         vtbar &= 0xffff0000;
4137
4138         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4139         drhd = dmar_find_matched_drhd_unit(pdev);
4140         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4141                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4142                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4143                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4144         }
4145 }
4146 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4147
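/*
 * Mark DMAR units that can be ignored: units with an empty device scope,
 * and, when graphics mapping is disabled, units that cover nothing but
 * graphics devices (whose devices then get a dummy identity).
 */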
4148 static void __init init_no_remapping_devices(void)
4149 {
4150         struct dmar_drhd_unit *drhd;
4151         struct device *dev;
4152         int i;
4153
4154         for_each_drhd_unit(drhd) {
4155                 if (!drhd->include_all) {
4156                         for_each_active_dev_scope(drhd->devices,
4157                                                   drhd->devices_cnt, i, dev)
4158                                 break;
4159                         /* ignore DMAR unit if no devices exist */
4160                         if (i == drhd->devices_cnt)
4161                                 drhd->ignored = 1;
4162                 }
4163         }
4164
4165         for_each_active_drhd_unit(drhd) {
4166                 if (drhd->include_all)
4167                         continue;
4168
4169                 for_each_active_dev_scope(drhd->devices,
4170                                           drhd->devices_cnt, i, dev)
4171                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4172                                 break;
4173                 if (i < drhd->devices_cnt)
4174                         continue;
4175
4176                 /* This IOMMU has *only* gfx devices. If graphics mapping is
4177                    disabled, bypass it by marking its devices as dummy. */
4178                 if (!dmar_map_gfx) {
4179                         drhd->ignored = 1;
4180                         for_each_active_dev_scope(drhd->devices,
4181                                                   drhd->devices_cnt, i, dev)
4182                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4183                 }
4184         }
4185 }
4186
4187 #ifdef CONFIG_SUSPEND
4188 static int init_iommu_hw(void)
4189 {
4190         struct dmar_drhd_unit *drhd;
4191         struct intel_iommu *iommu = NULL;
4192
4193         for_each_active_iommu(iommu, drhd)
4194                 if (iommu->qi)
4195                         dmar_reenable_qi(iommu);
4196
4197         for_each_iommu(iommu, drhd) {
4198                 if (drhd->ignored) {
4199                         /*
4200                          * we always have to disable PMRs or DMA may fail on
4201                          * this device
4202                          */
4203                         if (force_on)
4204                                 iommu_disable_protect_mem_regions(iommu);
4205                         continue;
4206                 }
4207
4208                 iommu_flush_write_buffer(iommu);
4209
4210                 iommu_set_root_entry(iommu);
4211
4212                 iommu->flush.flush_context(iommu, 0, 0, 0,
4213                                            DMA_CCMD_GLOBAL_INVL);
4214                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4215                 iommu_enable_translation(iommu);
4216                 iommu_disable_protect_mem_regions(iommu);
4217         }
4218
4219         return 0;
4220 }
4221
4222 static void iommu_flush_all(void)
4223 {
4224         struct dmar_drhd_unit *drhd;
4225         struct intel_iommu *iommu;
4226
4227         for_each_active_iommu(iommu, drhd) {
4228                 iommu->flush.flush_context(iommu, 0, 0, 0,
4229                                            DMA_CCMD_GLOBAL_INVL);
4230                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4231                                          DMA_TLB_GLOBAL_FLUSH);
4232         }
4233 }
4234
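/*
 * On suspend, flush all caches, disable translation and save the
 * fault-event registers of every active IOMMU; on resume, re-enable the
 * hardware via init_iommu_hw() and restore those registers.
 */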
4235 static int iommu_suspend(void)
4236 {
4237         struct dmar_drhd_unit *drhd;
4238         struct intel_iommu *iommu = NULL;
4239         unsigned long flag;
4240
4241         for_each_active_iommu(iommu, drhd) {
4242                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4243                                                  GFP_ATOMIC);
4244                 if (!iommu->iommu_state)
4245                         goto nomem;
4246         }
4247
4248         iommu_flush_all();
4249
4250         for_each_active_iommu(iommu, drhd) {
4251                 iommu_disable_translation(iommu);
4252
4253                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4254
4255                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4256                         readl(iommu->reg + DMAR_FECTL_REG);
4257                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4258                         readl(iommu->reg + DMAR_FEDATA_REG);
4259                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4260                         readl(iommu->reg + DMAR_FEADDR_REG);
4261                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4262                         readl(iommu->reg + DMAR_FEUADDR_REG);
4263
4264                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4265         }
4266         return 0;
4267
4268 nomem:
4269         for_each_active_iommu(iommu, drhd)
4270                 kfree(iommu->iommu_state);
4271
4272         return -ENOMEM;
4273 }
4274
4275 static void iommu_resume(void)
4276 {
4277         struct dmar_drhd_unit *drhd;
4278         struct intel_iommu *iommu = NULL;
4279         unsigned long flag;
4280
4281         if (init_iommu_hw()) {
4282                 if (force_on)
4283                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4284                 else
4285                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4286                 return;
4287         }
4288
4289         for_each_active_iommu(iommu, drhd) {
4290
4291                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4292
4293                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4294                         iommu->reg + DMAR_FECTL_REG);
4295                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4296                         iommu->reg + DMAR_FEDATA_REG);
4297                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4298                         iommu->reg + DMAR_FEADDR_REG);
4299                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4300                         iommu->reg + DMAR_FEUADDR_REG);
4301
4302                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4303         }
4304
4305         for_each_active_iommu(iommu, drhd)
4306                 kfree(iommu->iommu_state);
4307 }
4308
4309 static struct syscore_ops iommu_syscore_ops = {
4310         .resume         = iommu_resume,
4311         .suspend        = iommu_suspend,
4312 };
4313
4314 static void __init init_iommu_pm_ops(void)
4315 {
4316         register_syscore_ops(&iommu_syscore_ops);
4317 }
4318
4319 #else
4320 static inline void init_iommu_pm_ops(void) {}
4321 #endif  /* CONFIG_SUSPEND */
4322
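/*
 * Parse one RMRR (Reserved Memory Region Reporting) structure from the
 * DMAR table, recording its address range and device scope so the region
 * can be identity-mapped for the affected devices (typically USB and
 * graphics) later on.
 */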
4323 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4324 {
4325         struct acpi_dmar_reserved_memory *rmrr;
4326         struct dmar_rmrr_unit *rmrru;
4327
4328         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4329         if (!rmrru)
4330                 goto out;
4331
4332         rmrru->hdr = header;
4333         rmrr = (struct acpi_dmar_reserved_memory *)header;
4334         rmrru->base_address = rmrr->base_address;
4335         rmrru->end_address = rmrr->end_address;
4336
4337         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4338                                 ((void *)rmrr) + rmrr->header.length,
4339                                 &rmrru->devices_cnt);
4340         if (rmrru->devices_cnt && rmrru->devices == NULL)
4341                 goto free_rmrru;
4342
4343         list_add(&rmrru->list, &dmar_rmrr_units);
4344
4345         return 0;
4346 free_rmrru:
4347         kfree(rmrru);
4348 out:
4349         return -ENOMEM;
4350 }
4351
4352 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4353 {
4354         struct dmar_atsr_unit *atsru;
4355         struct acpi_dmar_atsr *tmp;
4356
4357         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4358                                 dmar_rcu_check()) {
4359                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4360                 if (atsr->segment != tmp->segment)
4361                         continue;
4362                 if (atsr->header.length != tmp->header.length)
4363                         continue;
4364                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4365                         return atsru;
4366         }
4367
4368         return NULL;
4369 }
4370
4371 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4372 {
4373         struct acpi_dmar_atsr *atsr;
4374         struct dmar_atsr_unit *atsru;
4375
4376         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4377                 return 0;
4378
4379         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4380         atsru = dmar_find_atsr(atsr);
4381         if (atsru)
4382                 return 0;
4383
4384         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4385         if (!atsru)
4386                 return -ENOMEM;
4387
4388         /*
4389          * If the memory was allocated from the slab by an ACPI _DSM method,
4390          * we need to copy its content because the buffer will be freed on
4391          * return.
4392          */
4393         atsru->hdr = (void *)(atsru + 1);
4394         memcpy(atsru->hdr, hdr, hdr->length);
4395         atsru->include_all = atsr->flags & 0x1;
4396         if (!atsru->include_all) {
4397                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4398                                 (void *)atsr + atsr->header.length,
4399                                 &atsru->devices_cnt);
4400                 if (atsru->devices_cnt && atsru->devices == NULL) {
4401                         kfree(atsru);
4402                         return -ENOMEM;
4403                 }
4404         }
4405
4406         list_add_rcu(&atsru->list, &dmar_atsr_units);
4407
4408         return 0;
4409 }
4410
4411 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4412 {
4413         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4414         kfree(atsru);
4415 }
4416
4417 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4418 {
4419         struct acpi_dmar_atsr *atsr;
4420         struct dmar_atsr_unit *atsru;
4421
4422         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4423         atsru = dmar_find_atsr(atsr);
4424         if (atsru) {
4425                 list_del_rcu(&atsru->list);
4426                 synchronize_rcu();
4427                 intel_iommu_free_atsr(atsru);
4428         }
4429
4430         return 0;
4431 }
4432
4433 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4434 {
4435         int i;
4436         struct device *dev;
4437         struct acpi_dmar_atsr *atsr;
4438         struct dmar_atsr_unit *atsru;
4439
4440         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4441         atsru = dmar_find_atsr(atsr);
4442         if (!atsru)
4443                 return 0;
4444
4445         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4446                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4447                                           i, dev)
4448                         return -EBUSY;
4449         }
4450
4451         return 0;
4452 }
4453
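/*
 * Bring the IOMMU of a newly added DRHD unit online: check its capabilities
 * against the current global settings (pass-through, snooping, superpages),
 * allocate domain IDs and a root entry, initialize queued invalidation,
 * interrupts and, where supported, SVM page requests, and finally enable
 * DMA translation.
 */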
4454 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4455 {
4456         int sp, ret;
4457         struct intel_iommu *iommu = dmaru->iommu;
4458
4459         if (g_iommus[iommu->seq_id])
4460                 return 0;
4461
4462         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4463                 pr_warn("%s: Doesn't support hardware pass through.\n",
4464                         iommu->name);
4465                 return -ENXIO;
4466         }
4467         if (!ecap_sc_support(iommu->ecap) &&
4468             domain_update_iommu_snooping(iommu)) {
4469                 pr_warn("%s: Doesn't support snooping.\n",
4470                         iommu->name);
4471                 return -ENXIO;
4472         }
4473         sp = domain_update_iommu_superpage(iommu) - 1;
4474         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4475                 pr_warn("%s: Doesn't support large page.\n",
4476                         iommu->name);
4477                 return -ENXIO;
4478         }
4479
4480         /*
4481          * Disable translation if already enabled prior to OS handover.
4482          */
4483         if (iommu->gcmd & DMA_GCMD_TE)
4484                 iommu_disable_translation(iommu);
4485
4486         g_iommus[iommu->seq_id] = iommu;
4487         ret = iommu_init_domains(iommu);
4488         if (ret == 0)
4489                 ret = iommu_alloc_root_entry(iommu);
4490         if (ret)
4491                 goto out;
4492
4493 #ifdef CONFIG_INTEL_IOMMU_SVM
4494         if (pasid_supported(iommu))
4495                 intel_svm_init(iommu);
4496 #endif
4497
4498         if (dmaru->ignored) {
4499                 /*
4500                  * we always have to disable PMRs or DMA may fail on this device
4501                  */
4502                 if (force_on)
4503                         iommu_disable_protect_mem_regions(iommu);
4504                 return 0;
4505         }
4506
4507         intel_iommu_init_qi(iommu);
4508         iommu_flush_write_buffer(iommu);
4509
4510 #ifdef CONFIG_INTEL_IOMMU_SVM
4511         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4512                 ret = intel_svm_enable_prq(iommu);
4513                 if (ret)
4514                         goto disable_iommu;
4515         }
4516 #endif
4517         ret = dmar_set_interrupt(iommu);
4518         if (ret)
4519                 goto disable_iommu;
4520
4521         iommu_set_root_entry(iommu);
4522         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4523         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4524         iommu_enable_translation(iommu);
4525
4526         iommu_disable_protect_mem_regions(iommu);
4527         return 0;
4528
4529 disable_iommu:
4530         disable_dmar_iommu(iommu);
4531 out:
4532         free_dmar_iommu(iommu);
4533         return ret;
4534 }
4535
4536 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4537 {
4538         int ret = 0;
4539         struct intel_iommu *iommu = dmaru->iommu;
4540
4541         if (!intel_iommu_enabled)
4542                 return 0;
4543         if (iommu == NULL)
4544                 return -EINVAL;
4545
4546         if (insert) {
4547                 ret = intel_iommu_add(dmaru);
4548         } else {
4549                 disable_dmar_iommu(iommu);
4550                 free_dmar_iommu(iommu);
4551         }
4552
4553         return ret;
4554 }
4555
4556 static void intel_iommu_free_dmars(void)
4557 {
4558         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4559         struct dmar_atsr_unit *atsru, *atsr_n;
4560
4561         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4562                 list_del(&rmrru->list);
4563                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4564                 kfree(rmrru);
4565         }
4566
4567         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4568                 list_del(&atsru->list);
4569                 intel_iommu_free_atsr(atsru);
4570         }
4571 }
4572
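/*
 * Decide whether ATS may be used for @dev: root-complex integrated endpoints
 * (no upstream bridge) are allowed, devices reached through a conventional
 * PCI bridge are not, and anything else is allowed only if its root port is
 * covered by an ATSR device scope (or by an ATSR flagged INCLUDE_ALL).
 * Returns 1 if ATS is allowed, 0 otherwise.
 */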
4573 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4574 {
4575         int i, ret = 1;
4576         struct pci_bus *bus;
4577         struct pci_dev *bridge = NULL;
4578         struct device *tmp;
4579         struct acpi_dmar_atsr *atsr;
4580         struct dmar_atsr_unit *atsru;
4581
4582         dev = pci_physfn(dev);
4583         for (bus = dev->bus; bus; bus = bus->parent) {
4584                 bridge = bus->self;
4585                 /* If it's an integrated device, allow ATS */
4586                 if (!bridge)
4587                         return 1;
4588                 /* Connected via non-PCIe: no ATS */
4589                 if (!pci_is_pcie(bridge) ||
4590                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4591                         return 0;
4592                 /* If we found the root port, look it up in the ATSR */
4593                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4594                         break;
4595         }
4596
4597         rcu_read_lock();
4598         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4599                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4600                 if (atsr->segment != pci_domain_nr(dev->bus))
4601                         continue;
4602
4603                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4604                         if (tmp == &bridge->dev)
4605                                 goto out;
4606
4607                 if (atsru->include_all)
4608                         goto out;
4609         }
4610         ret = 0;
4611 out:
4612         rcu_read_unlock();
4613
4614         return ret;
4615 }
4616
4617 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4618 {
4619         int ret;
4620         struct dmar_rmrr_unit *rmrru;
4621         struct dmar_atsr_unit *atsru;
4622         struct acpi_dmar_atsr *atsr;
4623         struct acpi_dmar_reserved_memory *rmrr;
4624
4625         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4626                 return 0;
4627
4628         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4629                 rmrr = container_of(rmrru->hdr,
4630                                     struct acpi_dmar_reserved_memory, header);
4631                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4632                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4633                                 ((void *)rmrr) + rmrr->header.length,
4634                                 rmrr->segment, rmrru->devices,
4635                                 rmrru->devices_cnt);
4636                         if (ret < 0)
4637                                 return ret;
4638                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4639                         dmar_remove_dev_scope(info, rmrr->segment,
4640                                 rmrru->devices, rmrru->devices_cnt);
4641                 }
4642         }
4643
4644         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4645                 if (atsru->include_all)
4646                         continue;
4647
4648                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4649                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4650                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4651                                         (void *)atsr + atsr->header.length,
4652                                         atsr->segment, atsru->devices,
4653                                         atsru->devices_cnt);
4654                         if (ret > 0)
4655                                 break;
4656                         else if (ret < 0)
4657                                 return ret;
4658                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4659                         if (dmar_remove_dev_scope(info, atsr->segment,
4660                                         atsru->devices, atsru->devices_cnt))
4661                                 break;
4662                 }
4663         }
4664
4665         return 0;
4666 }
4667
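/*
 * Memory hotplug notifier, registered only when the static identity domain
 * (si_domain) is in use without hardware pass-through: memory going online
 * gets an identity mapping added, and when memory goes offline the matching
 * IOVAs are unmapped, the IOTLBs flushed and the page-table pages freed.
 */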
4668 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4669                                        unsigned long val, void *v)
4670 {
4671         struct memory_notify *mhp = v;
4672         unsigned long long start, end;
4673         unsigned long start_vpfn, last_vpfn;
4674
4675         switch (val) {
4676         case MEM_GOING_ONLINE:
4677                 start = mhp->start_pfn << PAGE_SHIFT;
4678                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4679                 if (iommu_domain_identity_map(si_domain, start, end)) {
4680                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4681                                 start, end);
4682                         return NOTIFY_BAD;
4683                 }
4684                 break;
4685
4686         case MEM_OFFLINE:
4687         case MEM_CANCEL_ONLINE:
4688                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4689                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4690                 while (start_vpfn <= last_vpfn) {
4691                         struct iova *iova;
4692                         struct dmar_drhd_unit *drhd;
4693                         struct intel_iommu *iommu;
4694                         struct page *freelist;
4695
4696                         iova = find_iova(&si_domain->iovad, start_vpfn);
4697                         if (iova == NULL) {
4698                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4699                                          start_vpfn);
4700                                 break;
4701                         }
4702
4703                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4704                                                      start_vpfn, last_vpfn);
4705                         if (iova == NULL) {
4706                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4707                                         start_vpfn, last_vpfn);
4708                                 return NOTIFY_BAD;
4709                         }
4710
4711                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4712                                                iova->pfn_hi);
4713
4714                         rcu_read_lock();
4715                         for_each_active_iommu(iommu, drhd)
4716                                 iommu_flush_iotlb_psi(iommu, si_domain,
4717                                         iova->pfn_lo, iova_size(iova),
4718                                         !freelist, 0);
4719                         rcu_read_unlock();
4720                         dma_free_pagelist(freelist);
4721
4722                         start_vpfn = iova->pfn_hi + 1;
4723                         free_iova_mem(iova);
4724                 }
4725                 break;
4726         }
4727
4728         return NOTIFY_OK;
4729 }
4730
4731 static struct notifier_block intel_iommu_memory_nb = {
4732         .notifier_call = intel_iommu_memory_notifier,
4733         .priority = 0
4734 };
4735
4736 static void free_all_cpu_cached_iovas(unsigned int cpu)
4737 {
4738         int i;
4739
4740         for (i = 0; i < g_num_of_iommus; i++) {
4741                 struct intel_iommu *iommu = g_iommus[i];
4742                 struct dmar_domain *domain;
4743                 int did;
4744
4745                 if (!iommu)
4746                         continue;
4747
4748                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4749                         domain = get_iommu_domain(iommu, (u16)did);
4750
4751                         if (!domain)
4752                                 continue;
4753                         free_cpu_cached_iovas(cpu, &domain->iovad);
4754                 }
4755         }
4756 }
4757
4758 static int intel_iommu_cpu_dead(unsigned int cpu)
4759 {
4760         free_all_cpu_cached_iovas(cpu);
4761         return 0;
4762 }
4763
4764 static void intel_disable_iommus(void)
4765 {
4766         struct intel_iommu *iommu = NULL;
4767         struct dmar_drhd_unit *drhd;
4768
4769         for_each_iommu(iommu, drhd)
4770                 iommu_disable_translation(iommu);
4771 }
4772
4773 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4774 {
4775         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4776
4777         return container_of(iommu_dev, struct intel_iommu, iommu);
4778 }
4779
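/*
 * Per-IOMMU sysfs attributes, grouped under "intel-iommu" and registered
 * through iommu_device_sysfs_add() in intel_iommu_init().  They typically
 * appear as read-only files such as
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,...}
 * (the exact path depends on the name assigned to each DMAR unit).
 */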
4780 static ssize_t intel_iommu_show_version(struct device *dev,
4781                                         struct device_attribute *attr,
4782                                         char *buf)
4783 {
4784         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4785         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4786         return sprintf(buf, "%d:%d\n",
4787                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4788 }
4789 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4790
4791 static ssize_t intel_iommu_show_address(struct device *dev,
4792                                         struct device_attribute *attr,
4793                                         char *buf)
4794 {
4795         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4796         return sprintf(buf, "%llx\n", iommu->reg_phys);
4797 }
4798 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4799
4800 static ssize_t intel_iommu_show_cap(struct device *dev,
4801                                     struct device_attribute *attr,
4802                                     char *buf)
4803 {
4804         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4805         return sprintf(buf, "%llx\n", iommu->cap);
4806 }
4807 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4808
4809 static ssize_t intel_iommu_show_ecap(struct device *dev,
4810                                     struct device_attribute *attr,
4811                                     char *buf)
4812 {
4813         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4814         return sprintf(buf, "%llx\n", iommu->ecap);
4815 }
4816 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4817
4818 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4819                                       struct device_attribute *attr,
4820                                       char *buf)
4821 {
4822         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4823         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4824 }
4825 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4826
4827 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4828                                            struct device_attribute *attr,
4829                                            char *buf)
4830 {
4831         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4832         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4833                                                   cap_ndoms(iommu->cap)));
4834 }
4835 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4836
4837 static struct attribute *intel_iommu_attrs[] = {
4838         &dev_attr_version.attr,
4839         &dev_attr_address.attr,
4840         &dev_attr_cap.attr,
4841         &dev_attr_ecap.attr,
4842         &dev_attr_domains_supported.attr,
4843         &dev_attr_domains_used.attr,
4844         NULL,
4845 };
4846
4847 static struct attribute_group intel_iommu_group = {
4848         .name = "intel-iommu",
4849         .attrs = intel_iommu_attrs,
4850 };
4851
4852 const struct attribute_group *intel_iommu_groups[] = {
4853         &intel_iommu_group,
4854         NULL,
4855 };
4856
4857 static inline bool has_untrusted_dev(void)
4858 {
4859         struct pci_dev *pdev = NULL;
4860
4861         for_each_pci_dev(pdev)
4862                 if (pdev->untrusted)
4863                         return true;
4864
4865         return false;
4866 }
4867
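/*
 * Honour the DMAR platform opt-in: if the firmware requested DMA protection
 * and at least one untrusted (external-facing) device is present, force the
 * IOMMU on even if it was disabled on the command line.  Returns 1 when the
 * IOMMU has been force-enabled, 0 otherwise.
 */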
4868 static int __init platform_optin_force_iommu(void)
4869 {
4870         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4871                 return 0;
4872
4873         if (no_iommu || dmar_disabled)
4874                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4875
4876         /*
4877          * If Intel-IOMMU is disabled by default, we will apply identity
4878          * If Intel-IOMMU is disabled by default, we will apply the identity
4879          * map to all devices except those marked as untrusted.
4880         if (dmar_disabled)
4881                 iommu_identity_mapping |= IDENTMAP_ALL;
4882
4883         dmar_disabled = 0;
4884         no_iommu = 0;
4885
4886         return 1;
4887 }
4888
4889 static int __init probe_acpi_namespace_devices(void)
4890 {
4891         struct dmar_drhd_unit *drhd;
4892         /* To avoid a -Wunused-but-set-variable warning. */
4893         struct intel_iommu *iommu __maybe_unused;
4894         struct device *dev;
4895         int i, ret = 0;
4896
4897         for_each_active_iommu(iommu, drhd) {
4898                 for_each_active_dev_scope(drhd->devices,
4899                                           drhd->devices_cnt, i, dev) {
4900                         struct acpi_device_physical_node *pn;
4901                         struct iommu_group *group;
4902                         struct acpi_device *adev;
4903
4904                         if (dev->bus != &acpi_bus_type)
4905                                 continue;
4906
4907                         adev = to_acpi_device(dev);
4908                         mutex_lock(&adev->physical_node_lock);
4909                         list_for_each_entry(pn,
4910                                             &adev->physical_node_list, node) {
4911                                 group = iommu_group_get(pn->dev);
4912                                 if (group) {
4913                                         iommu_group_put(group);
4914                                         continue;
4915                                 }
4916
4917                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4918                                 ret = iommu_probe_device(pn->dev);
4919                                 if (ret)
4920                                         break;
4921                         }
4922                         mutex_unlock(&adev->physical_node_lock);
4923
4924                         if (ret)
4925                                 return ret;
4926                 }
4927         }
4928
4929         return 0;
4930 }
4931
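/*
 * Main VT-d initialization: parse the DMAR table and device scopes, reserve
 * IOVA ranges, program the IOMMUs via init_dmars(), install intel_dma_ops,
 * register the per-IOMMU sysfs/iommu-core devices and the memory/CPU hotplug
 * callbacks, and finally turn on DMA translation.
 */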
4932 int __init intel_iommu_init(void)
4933 {
4934         int ret = -ENODEV;
4935         struct dmar_drhd_unit *drhd;
4936         struct intel_iommu *iommu;
4937
4938         /*
4939          * Intel IOMMU is required for a TXT/tboot launch or platform
4940          * opt in, so enforce that.
4941          */
4942         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4943                     platform_optin_force_iommu();
4944
4945         if (iommu_init_mempool()) {
4946                 if (force_on)
4947                         panic("tboot: Failed to initialize iommu memory\n");
4948                 return -ENOMEM;
4949         }
4950
4951         down_write(&dmar_global_lock);
4952         if (dmar_table_init()) {
4953                 if (force_on)
4954                         panic("tboot: Failed to initialize DMAR table\n");
4955                 goto out_free_dmar;
4956         }
4957
4958         if (dmar_dev_scope_init() < 0) {
4959                 if (force_on)
4960                         panic("tboot: Failed to initialize DMAR device scope\n");
4961                 goto out_free_dmar;
4962         }
4963
4964         up_write(&dmar_global_lock);
4965
4966         /*
4967          * The bus notifier takes dmar_global_lock itself, so lockdep would
4968          * complain if we registered it while still holding the lock.
4969          */
4970         dmar_register_bus_notifier();
4971
4972         down_write(&dmar_global_lock);
4973
4974         if (!no_iommu)
4975                 intel_iommu_debugfs_init();
4976
4977         if (no_iommu || dmar_disabled) {
4978                 /*
4979                  * We exit the function here to ensure the IOMMU's remapping and
4980                  * mempool aren't set up, which means that the IOMMU's PMRs
4981                  * won't be disabled via the call to init_dmars(). So disable
4982                  * them explicitly here. The PMRs were set up by tboot prior to
4983                  * calling SENTER, but the kernel is expected to reset/tear
4984                  * down the PMRs.
4985                  */
4986                 if (intel_iommu_tboot_noforce) {
4987                         for_each_iommu(iommu, drhd)
4988                                 iommu_disable_protect_mem_regions(iommu);
4989                 }
4990
4991                 /*
4992                  * Make sure the IOMMUs are switched off, even when we
4993                  * boot into a kexec kernel and the previous kernel left
4994                  * them enabled.
4995                  */
4996                 intel_disable_iommus();
4997                 goto out_free_dmar;
4998         }
4999
5000         if (list_empty(&dmar_rmrr_units))
5001                 pr_info("No RMRR found\n");
5002
5003         if (list_empty(&dmar_atsr_units))
5004                 pr_info("No ATSR found\n");
5005
5006         if (dmar_init_reserved_ranges()) {
5007                 if (force_on)
5008                         panic("tboot: Failed to reserve iommu ranges\n");
5009                 goto out_free_reserved_range;
5010         }
5011
5012         if (dmar_map_gfx)
5013                 intel_iommu_gfx_mapped = 1;
5014
5015         init_no_remapping_devices();
5016
5017         ret = init_dmars();
5018         if (ret) {
5019                 if (force_on)
5020                         panic("tboot: Failed to initialize DMARs\n");
5021                 pr_err("Initialization failed\n");
5022                 goto out_free_reserved_range;
5023         }
5024         up_write(&dmar_global_lock);
5025
5026 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5027         /*
5028          * If the system has no untrusted device, or the user has decided
5029          * to disable the bounce page mechanism, we don't need swiotlb.
5030          * Note that here so that the pre-allocated bounce pages are
5031          * released later.
5032          */
5033         if (!has_untrusted_dev() || intel_no_bounce)
5034                 swiotlb = 0;
5035 #endif
5036         dma_ops = &intel_dma_ops;
5037
5038         init_iommu_pm_ops();
5039
5040         down_read(&dmar_global_lock);
5041         for_each_active_iommu(iommu, drhd) {
5042                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5043                                        intel_iommu_groups,
5044                                        "%s", iommu->name);
5045                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5046                 iommu_device_register(&iommu->iommu);
5047         }
5048         up_read(&dmar_global_lock);
5049
5050         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5051         if (si_domain && !hw_pass_through)
5052                 register_memory_notifier(&intel_iommu_memory_nb);
5053         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5054                           intel_iommu_cpu_dead);
5055
5056         down_read(&dmar_global_lock);
5057         if (probe_acpi_namespace_devices())
5058                 pr_warn("ACPI namespace devices didn't probe correctly\n");
5059
5060         /* Finally, we enable the DMA remapping hardware. */
5061         for_each_iommu(iommu, drhd) {
5062                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5063                         iommu_enable_translation(iommu);
5064
5065                 iommu_disable_protect_mem_regions(iommu);
5066         }
5067         up_read(&dmar_global_lock);
5068
5069         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5070
5071         intel_iommu_enabled = 1;
5072
5073         return 0;
5074
5075 out_free_reserved_range:
5076         put_iova_domain(&reserved_iova_list);
5077 out_free_dmar:
5078         intel_iommu_free_dmars();
5079         up_write(&dmar_global_lock);
5080         iommu_exit_mempool();
5081         return ret;
5082 }
5083
5084 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5085 {
5086         struct intel_iommu *iommu = opaque;
5087
5088         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5089         return 0;
5090 }
5091
5092 /*
5093  * NB - intel-iommu lacks any sort of reference counting for the users of
5094  * dependent devices.  If multiple endpoints have intersecting dependent
5095  * devices, unbinding the driver from any one of them will possibly leave
5096  * the others unable to operate.
5097  */
5098 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5099 {
5100         if (!iommu || !dev || !dev_is_pci(dev))
5101                 return;
5102
5103         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5104 }
5105
5106 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5107 {
5108         struct dmar_domain *domain;
5109         struct intel_iommu *iommu;
5110         unsigned long flags;
5111
5112         assert_spin_locked(&device_domain_lock);
5113
5114         if (WARN_ON(!info))
5115                 return;
5116
5117         iommu = info->iommu;
5118         domain = info->domain;
5119
5120         if (info->dev) {
5121                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5122                         intel_pasid_tear_down_entry(iommu, info->dev,
5123                                         PASID_RID2PASID);
5124
5125                 iommu_disable_dev_iotlb(info);
5126                 domain_context_clear(iommu, info->dev);
5127                 intel_pasid_free_table(info->dev);
5128         }
5129
5130         unlink_domain_info(info);
5131
5132         spin_lock_irqsave(&iommu->lock, flags);
5133         domain_detach_iommu(domain, iommu);
5134         spin_unlock_irqrestore(&iommu->lock, flags);
5135
5136         /* free the private domain */
5137         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5138             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5139             list_empty(&domain->devices))
5140                 domain_exit(info->domain);
5141
5142         free_devinfo_mem(info);
5143 }
5144
5145 static void dmar_remove_one_dev_info(struct device *dev)
5146 {
5147         struct device_domain_info *info;
5148         unsigned long flags;
5149
5150         spin_lock_irqsave(&device_domain_lock, flags);
5151         info = dev->archdata.iommu;
5152         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5153             && info != DUMMY_DEVICE_DOMAIN_INFO)
5154                 __dmar_remove_one_dev_info(info);
5155         spin_unlock_irqrestore(&device_domain_lock, flags);
5156 }
5157
5158 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5159 {
5160         int adjust_width;
5161
5162         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5163         domain_reserve_special_ranges(domain);
5164
5165         /* calculate AGAW */
5166         domain->gaw = guest_width;
5167         adjust_width = guestwidth_to_adjustwidth(guest_width);
5168         domain->agaw = width_to_agaw(adjust_width);
5169
5170         domain->iommu_coherency = 0;
5171         domain->iommu_snooping = 0;
5172         domain->iommu_superpage = 0;
5173         domain->max_addr = 0;
5174
5175         /* always allocate the top pgd */
5176         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5177         if (!domain->pgd)
5178                 return -ENOMEM;
5179         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5180         return 0;
5181 }
5182
5183 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5184 {
5185         struct dmar_domain *dmar_domain;
5186         struct iommu_domain *domain;
5187
5188         switch (type) {
5189         case IOMMU_DOMAIN_DMA:
5190         /* fallthrough */
5191         case IOMMU_DOMAIN_UNMANAGED:
5192                 dmar_domain = alloc_domain(0);
5193                 if (!dmar_domain) {
5194                         pr_err("Can't allocate dmar_domain\n");
5195                         return NULL;
5196                 }
5197                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5198                         pr_err("Domain initialization failed\n");
5199                         domain_exit(dmar_domain);
5200                         return NULL;
5201                 }
5202
5203                 if (type == IOMMU_DOMAIN_DMA &&
5204                     init_iova_flush_queue(&dmar_domain->iovad,
5205                                           iommu_flush_iova, iova_entry_free)) {
5206                         pr_warn("iova flush queue initialization failed\n");
5207                         intel_iommu_strict = 1;
5208                 }
5209
5210                 domain_update_iommu_cap(dmar_domain);
5211
5212                 domain = &dmar_domain->domain;
5213                 domain->geometry.aperture_start = 0;
5214                 domain->geometry.aperture_end   =
5215                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5216                 domain->geometry.force_aperture = true;
5217
5218                 return domain;
5219         case IOMMU_DOMAIN_IDENTITY:
5220                 return &si_domain->domain;
5221         default:
5222                 return NULL;
5223         }
5224
5225         return NULL;
5226 }
5227
5228 static void intel_iommu_domain_free(struct iommu_domain *domain)
5229 {
5230         if (domain != &si_domain->domain)
5231                 domain_exit(to_dmar_domain(domain));
5232 }
5233
5234 /*
5235  * Check whether a @domain could be attached to the @dev through the
5236  * aux-domain attach/detach APIs.
5237  */
5238 static inline bool
5239 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5240 {
5241         struct device_domain_info *info = dev->archdata.iommu;
5242
5243         return info && info->auxd_enabled &&
5244                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5245 }
5246
5247 static void auxiliary_link_device(struct dmar_domain *domain,
5248                                   struct device *dev)
5249 {
5250         struct device_domain_info *info = dev->archdata.iommu;
5251
5252         assert_spin_locked(&device_domain_lock);
5253         if (WARN_ON(!info))
5254                 return;
5255
5256         domain->auxd_refcnt++;
5257         list_add(&domain->auxd, &info->auxiliary_domains);
5258 }
5259
5260 static void auxiliary_unlink_device(struct dmar_domain *domain,
5261                                     struct device *dev)
5262 {
5263         struct device_domain_info *info = dev->archdata.iommu;
5264
5265         assert_spin_locked(&device_domain_lock);
5266         if (WARN_ON(!info))
5267                 return;
5268
5269         list_del(&domain->auxd);
5270         domain->auxd_refcnt--;
5271
5272         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5273                 intel_pasid_free_id(domain->default_pasid);
5274 }
5275
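/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to @dev's IOMMU and install
 * a second-level PASID table entry, all under device_domain_lock and
 * iommu->lock.
 */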
5276 static int aux_domain_add_dev(struct dmar_domain *domain,
5277                               struct device *dev)
5278 {
5279         int ret;
5280         u8 bus, devfn;
5281         unsigned long flags;
5282         struct intel_iommu *iommu;
5283
5284         iommu = device_to_iommu(dev, &bus, &devfn);
5285         if (!iommu)
5286                 return -ENODEV;
5287
5288         if (domain->default_pasid <= 0) {
5289                 int pasid;
5290
5291                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5292                                              pci_max_pasids(to_pci_dev(dev)),
5293                                              GFP_KERNEL);
5294                 if (pasid <= 0) {
5295                         pr_err("Can't allocate default pasid\n");
5296                         return -ENODEV;
5297                 }
5298                 domain->default_pasid = pasid;
5299         }
5300
5301         spin_lock_irqsave(&device_domain_lock, flags);
5302         /*
5303          * iommu->lock must be held to attach the domain to the iommu and to
5304          * set up the PASID entry for second-level translation.
5305          */
5306         spin_lock(&iommu->lock);
5307         ret = domain_attach_iommu(domain, iommu);
5308         if (ret)
5309                 goto attach_failed;
5310
5311         /* Set up the PASID entry for mediated devices: */
5312         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5313                                              domain->default_pasid);
5314         if (ret)
5315                 goto table_failed;
5316         spin_unlock(&iommu->lock);
5317
5318         auxiliary_link_device(domain, dev);
5319
5320         spin_unlock_irqrestore(&device_domain_lock, flags);
5321
5322         return 0;
5323
5324 table_failed:
5325         domain_detach_iommu(domain, iommu);
5326 attach_failed:
5327         spin_unlock(&iommu->lock);
5328         spin_unlock_irqrestore(&device_domain_lock, flags);
5329         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5330                 intel_pasid_free_id(domain->default_pasid);
5331
5332         return ret;
5333 }
5334
5335 static void aux_domain_remove_dev(struct dmar_domain *domain,
5336                                   struct device *dev)
5337 {
5338         struct device_domain_info *info;
5339         struct intel_iommu *iommu;
5340         unsigned long flags;
5341
5342         if (!is_aux_domain(dev, &domain->domain))
5343                 return;
5344
5345         spin_lock_irqsave(&device_domain_lock, flags);
5346         info = dev->archdata.iommu;
5347         iommu = info->iommu;
5348
5349         auxiliary_unlink_device(domain, dev);
5350
5351         spin_lock(&iommu->lock);
5352         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5353         domain_detach_iommu(domain, iommu);
5354         spin_unlock(&iommu->lock);
5355
5356         spin_unlock_irqrestore(&device_domain_lock, flags);
5357 }
5358
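/*
 * Before attaching @dev, make sure its IOMMU can address everything already
 * mapped in the domain (limited by the hardware MGAW), and strip any extra
 * page-table levels the IOMMU cannot walk.
 */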
5359 static int prepare_domain_attach_device(struct iommu_domain *domain,
5360                                         struct device *dev)
5361 {
5362         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5363         struct intel_iommu *iommu;
5364         int addr_width;
5365         u8 bus, devfn;
5366
5367         iommu = device_to_iommu(dev, &bus, &devfn);
5368         if (!iommu)
5369                 return -ENODEV;
5370
5371         /* check if this iommu agaw is sufficient for max mapped address */
5372         addr_width = agaw_to_width(iommu->agaw);
5373         if (addr_width > cap_mgaw(iommu->cap))
5374                 addr_width = cap_mgaw(iommu->cap);
5375
5376         if (dmar_domain->max_addr > (1LL << addr_width)) {
5377                 dev_err(dev,
5378                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5379                         __func__, addr_width, dmar_domain->max_addr);
5380                 return -EFAULT;
5381         }
5382         dmar_domain->gaw = addr_width;
5383
5384         /*
5385          * Knock out extra levels of page tables if necessary
5386          */
5387         while (iommu->agaw < dmar_domain->agaw) {
5388                 struct dma_pte *pte;
5389
5390                 pte = dmar_domain->pgd;
5391                 if (dma_pte_present(pte)) {
5392                         dmar_domain->pgd = (struct dma_pte *)
5393                                 phys_to_virt(dma_pte_addr(pte));
5394                         free_pgtable_page(pte);
5395                 }
5396                 dmar_domain->agaw--;
5397         }
5398
5399         return 0;
5400 }
5401
5402 static int intel_iommu_attach_device(struct iommu_domain *domain,
5403                                      struct device *dev)
5404 {
5405         int ret;
5406
5407         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5408             device_is_rmrr_locked(dev)) {
5409                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5410                 return -EPERM;
5411         }
5412
5413         if (is_aux_domain(dev, domain))
5414                 return -EPERM;
5415
5416         /* normally dev is not mapped */
5417         if (unlikely(domain_context_mapped(dev))) {
5418                 struct dmar_domain *old_domain;
5419
5420                 old_domain = find_domain(dev);
5421                 if (old_domain)
5422                         dmar_remove_one_dev_info(dev);
5423         }
5424
5425         ret = prepare_domain_attach_device(domain, dev);
5426         if (ret)
5427                 return ret;
5428
5429         return domain_add_dev_info(to_dmar_domain(domain), dev);
5430 }
5431
5432 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5433                                          struct device *dev)
5434 {
5435         int ret;
5436
5437         if (!is_aux_domain(dev, domain))
5438                 return -EPERM;
5439
5440         ret = prepare_domain_attach_device(domain, dev);
5441         if (ret)
5442                 return ret;
5443
5444         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5445 }
5446
5447 static void intel_iommu_detach_device(struct iommu_domain *domain,
5448                                       struct device *dev)
5449 {
5450         dmar_remove_one_dev_info(dev);
5451 }
5452
5453 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5454                                           struct device *dev)
5455 {
5456         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5457 }
5458
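/*
 * iommu_ops map callback: translate IOMMU_READ/WRITE/CACHE into DMA_PTE_*
 * bits, extend the domain's max_addr (checking that it still fits within the
 * guest address width), and install the page-table entries with
 * domain_pfn_mapping().
 */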
5459 static int intel_iommu_map(struct iommu_domain *domain,
5460                            unsigned long iova, phys_addr_t hpa,
5461                            size_t size, int iommu_prot, gfp_t gfp)
5462 {
5463         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5464         u64 max_addr;
5465         int prot = 0;
5466         int ret;
5467
5468         if (iommu_prot & IOMMU_READ)
5469                 prot |= DMA_PTE_READ;
5470         if (iommu_prot & IOMMU_WRITE)
5471                 prot |= DMA_PTE_WRITE;
5472         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5473                 prot |= DMA_PTE_SNP;
5474
5475         max_addr = iova + size;
5476         if (dmar_domain->max_addr < max_addr) {
5477                 u64 end;
5478
5479                 /* check if minimum agaw is sufficient for mapped address */
5480                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5481                 if (end < max_addr) {
5482                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5483                                __func__, dmar_domain->gaw, max_addr);
5485                         return -EFAULT;
5486                 }
5487                 dmar_domain->max_addr = max_addr;
5488         }
5489         /* Round up size to next multiple of PAGE_SIZE, if it and
5490            the low bits of hpa would take us onto the next page */
5491         size = aligned_nrpages(hpa, size);
5492         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5493                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5494         return ret;
5495 }
5496
5497 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5498                                 unsigned long iova, size_t size,
5499                                 struct iommu_iotlb_gather *gather)
5500 {
5501         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5502         struct page *freelist = NULL;
5503         unsigned long start_pfn, last_pfn;
5504         unsigned int npages;
5505         int iommu_id, level = 0;
5506
5507         /* Cope with horrid API which requires us to unmap more than the
5508            size argument if it happens to be a large-page mapping. */
5509         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5510
5511         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5512                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5513
5514         start_pfn = iova >> VTD_PAGE_SHIFT;
5515         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5516
5517         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5518
5519         npages = last_pfn - start_pfn + 1;
5520
5521         for_each_domain_iommu(iommu_id, dmar_domain)
5522                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5523                                       start_pfn, npages, !freelist, 0);
5524
5525         dma_free_pagelist(freelist);
5526
5527         if (dmar_domain->max_addr == iova + size)
5528                 dmar_domain->max_addr = iova;
5529
5530         return size;
5531 }
5532
5533 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5534                                             dma_addr_t iova)
5535 {
5536         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5537         struct dma_pte *pte;
5538         int level = 0;
5539         u64 phys = 0;
5540
5541         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5542         if (pte && dma_pte_present(pte))
5543                 phys = dma_pte_addr(pte) +
5544                         (iova & (BIT_MASK(level_to_offset_bits(level) +
5545                                                 VTD_PAGE_SHIFT) - 1));
5546
5547         return phys;
5548 }
5549
5550 static inline bool scalable_mode_support(void)
5551 {
5552         struct dmar_drhd_unit *drhd;
5553         struct intel_iommu *iommu;
5554         bool ret = true;
5555
5556         rcu_read_lock();
5557         for_each_active_iommu(iommu, drhd) {
5558                 if (!sm_supported(iommu)) {
5559                         ret = false;
5560                         break;
5561                 }
5562         }
5563         rcu_read_unlock();
5564
5565         return ret;
5566 }
5567
5568 static inline bool iommu_pasid_support(void)
5569 {
5570         struct dmar_drhd_unit *drhd;
5571         struct intel_iommu *iommu;
5572         bool ret = true;
5573
5574         rcu_read_lock();
5575         for_each_active_iommu(iommu, drhd) {
5576                 if (!pasid_supported(iommu)) {
5577                         ret = false;
5578                         break;
5579                 }
5580         }
5581         rcu_read_unlock();
5582
5583         return ret;
5584 }
5585
5586 static bool intel_iommu_capable(enum iommu_cap cap)
5587 {
5588         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5589                 return domain_update_iommu_snooping(NULL) == 1;
5590         if (cap == IOMMU_CAP_INTR_REMAP)
5591                 return irq_remapping_enabled == 1;
5592
5593         return false;
5594 }
5595
5596 static int intel_iommu_add_device(struct device *dev)
5597 {
5598         struct dmar_domain *dmar_domain;
5599         struct iommu_domain *domain;
5600         struct intel_iommu *iommu;
5601         struct iommu_group *group;
5602         u8 bus, devfn;
5603         int ret;
5604
5605         iommu = device_to_iommu(dev, &bus, &devfn);
5606         if (!iommu)
5607                 return -ENODEV;
5608
5609         iommu_device_link(&iommu->iommu, dev);
5610
5611         if (translation_pre_enabled(iommu))
5612                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5613
5614         group = iommu_group_get_for_dev(dev);
5615
5616         if (IS_ERR(group)) {
5617                 ret = PTR_ERR(group);
5618                 goto unlink;
5619         }
5620
5621         iommu_group_put(group);
5622
5623         domain = iommu_get_domain_for_dev(dev);
5624         dmar_domain = to_dmar_domain(domain);
5625         if (domain->type == IOMMU_DOMAIN_DMA) {
5626                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5627                         ret = iommu_request_dm_for_dev(dev);
5628                         if (ret) {
5629                                 dmar_remove_one_dev_info(dev);
5630                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5631                                 domain_add_dev_info(si_domain, dev);
5632                                 dev_info(dev,
5633                                          "Device uses a private identity domain.\n");
5634                         }
5635                 }
5636         } else {
5637                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5638                         ret = iommu_request_dma_domain_for_dev(dev);
5639                         if (ret) {
5640                                 dmar_remove_one_dev_info(dev);
5641                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5642                                 if (!get_private_domain_for_dev(dev)) {
5643                                         dev_warn(dev,
5644                                                  "Failed to get a private domain.\n");
5645                                         ret = -ENOMEM;
5646                                         goto unlink;
5647                                 }
5648
5649                                 dev_info(dev,
5650                                          "Device uses a private dma domain.\n");
5651                         }
5652                 }
5653         }
5654
5655         if (device_needs_bounce(dev)) {
5656                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5657                 set_dma_ops(dev, &bounce_dma_ops);
5658         }
5659
5660         return 0;
5661
5662 unlink:
5663         iommu_device_unlink(&iommu->iommu, dev);
5664         return ret;
5665 }
5666
5667 static void intel_iommu_remove_device(struct device *dev)
5668 {
5669         struct intel_iommu *iommu;
5670         u8 bus, devfn;
5671
5672         iommu = device_to_iommu(dev, &bus, &devfn);
5673         if (!iommu)
5674                 return;
5675
5676         dmar_remove_one_dev_info(dev);
5677
5678         iommu_group_remove_device(dev);
5679
5680         iommu_device_unlink(&iommu->iommu, dev);
5681
5682         if (device_needs_bounce(dev))
5683                 set_dma_ops(dev, NULL);
5684 }
5685
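/*
 * Report the reserved regions relevant to @device: RMRRs that target it
 * (direct or relaxable), an optional ISA-range workaround for legacy floppy
 * DMA, and the IOAPIC/MSI window.
 */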
5686 static void intel_iommu_get_resv_regions(struct device *device,
5687                                          struct list_head *head)
5688 {
5689         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5690         struct iommu_resv_region *reg;
5691         struct dmar_rmrr_unit *rmrr;
5692         struct device *i_dev;
5693         int i;
5694
5695         down_read(&dmar_global_lock);
5696         for_each_rmrr_units(rmrr) {
5697                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5698                                           i, i_dev) {
5699                         struct iommu_resv_region *resv;
5700                         enum iommu_resv_type type;
5701                         size_t length;
5702
5703                         if (i_dev != device &&
5704                             !is_downstream_to_pci_bridge(device, i_dev))
5705                                 continue;
5706
5707                         length = rmrr->end_address - rmrr->base_address + 1;
5708
5709                         type = device_rmrr_is_relaxable(device) ?
5710                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5711
5712                         resv = iommu_alloc_resv_region(rmrr->base_address,
5713                                                        length, prot, type);
5714                         if (!resv)
5715                                 break;
5716
5717                         list_add_tail(&resv->list, head);
5718                 }
5719         }
5720         up_read(&dmar_global_lock);
5721
5722 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5723         if (dev_is_pci(device)) {
5724                 struct pci_dev *pdev = to_pci_dev(device);
5725
5726                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5727                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5728                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5729                         if (reg)
5730                                 list_add_tail(&reg->list, head);
5731                 }
5732         }
5733 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5734
5735         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5736                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5737                                       0, IOMMU_RESV_MSI);
5738         if (!reg)
5739                 return;
5740         list_add_tail(&reg->list, head);
5741 }
5742
5743 static void intel_iommu_put_resv_regions(struct device *dev,
5744                                          struct list_head *head)
5745 {
5746         struct iommu_resv_region *entry, *next;
5747
5748         list_for_each_entry_safe(entry, next, head, list)
5749                 kfree(entry);
5750 }
5751
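/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if the entry changed) and, if it wasn't
 * already, enable the device's PASID/dev-IOTLB capabilities via
 * iommu_enable_dev_iotlb().
 */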
5752 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5753 {
5754         struct device_domain_info *info;
5755         struct context_entry *context;
5756         struct dmar_domain *domain;
5757         unsigned long flags;
5758         u64 ctx_lo;
5759         int ret;
5760
5761         domain = find_domain(dev);
5762         if (!domain)
5763                 return -EINVAL;
5764
5765         spin_lock_irqsave(&device_domain_lock, flags);
5766         spin_lock(&iommu->lock);
5767
5768         ret = -EINVAL;
5769         info = dev->archdata.iommu;
5770         if (!info || !info->pasid_supported)
5771                 goto out;
5772
5773         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5774         if (WARN_ON(!context))
5775                 goto out;
5776
5777         ctx_lo = context[0].lo;
5778
5779         if (!(ctx_lo & CONTEXT_PASIDE)) {
5780                 ctx_lo |= CONTEXT_PASIDE;
5781                 context[0].lo = ctx_lo;
5782                 wmb();
5783                 iommu->flush.flush_context(iommu,
5784                                            domain->iommu_did[iommu->seq_id],
5785                                            PCI_DEVID(info->bus, info->devfn),
5786                                            DMA_CCMD_MASK_NOBIT,
5787                                            DMA_CCMD_DEVICE_INVL);
5788         }
5789
5790         /* Enable PASID support in the device, if it wasn't already */
5791         if (!info->pasid_enabled)
5792                 iommu_enable_dev_iotlb(info);
5793
5794         ret = 0;
5795
5796  out:
5797         spin_unlock(&iommu->lock);
5798         spin_unlock_irqrestore(&device_domain_lock, flags);
5799
5800         return ret;
5801 }
5802
5803 static void intel_iommu_apply_resv_region(struct device *dev,
5804                                           struct iommu_domain *domain,
5805                                           struct iommu_resv_region *region)
5806 {
5807         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5808         unsigned long start, end;
5809
5810         start = IOVA_PFN(region->start);
5811         end   = IOVA_PFN(region->start + region->length - 1);
5812
5813         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5814 }
5815
5816 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5817 {
5818         if (dev_is_pci(dev))
5819                 return pci_device_group(dev);
5820         return generic_device_group(dev);
5821 }
5822
5823 #ifdef CONFIG_INTEL_IOMMU_SVM
5824 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5825 {
5826         struct intel_iommu *iommu;
5827         u8 bus, devfn;
5828
5829         if (iommu_dummy(dev)) {
5830                 dev_warn(dev,
5831                          "No IOMMU translation for device; cannot enable SVM\n");
5832                 return NULL;
5833         }
5834
5835         iommu = device_to_iommu(dev, &bus, &devfn);
5836         if (!iommu) {
5837                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5838                 return NULL;
5839         }
5840
5841         return iommu;
5842 }
5843 #endif /* CONFIG_INTEL_IOMMU_SVM */
5844
5845 static int intel_iommu_enable_auxd(struct device *dev)
5846 {
5847         struct device_domain_info *info;
5848         struct intel_iommu *iommu;
5849         unsigned long flags;
5850         u8 bus, devfn;
5851         int ret;
5852
5853         iommu = device_to_iommu(dev, &bus, &devfn);
5854         if (!iommu || dmar_disabled)
5855                 return -EINVAL;
5856
5857         if (!sm_supported(iommu) || !pasid_supported(iommu))
5858                 return -EINVAL;
5859
5860         ret = intel_iommu_enable_pasid(iommu, dev);
5861         if (ret)
5862                 return -ENODEV;
5863
5864         spin_lock_irqsave(&device_domain_lock, flags);
5865         info = dev->archdata.iommu;
5866         info->auxd_enabled = 1;
5867         spin_unlock_irqrestore(&device_domain_lock, flags);
5868
5869         return 0;
5870 }
5871
5872 static int intel_iommu_disable_auxd(struct device *dev)
5873 {
5874         struct device_domain_info *info;
5875         unsigned long flags;
5876
5877         spin_lock_irqsave(&device_domain_lock, flags);
5878         info = dev->archdata.iommu;
5879         if (!WARN_ON(!info))
5880                 info->auxd_enabled = 0;
5881         spin_unlock_irqrestore(&device_domain_lock, flags);
5882
5883         return 0;
5884 }
5885
5886 /*
5887  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5888  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5889  * specification so that system software and tools can detect endpoint
5890  * devices supporting Intel Scalable I/O Virtualization without depending on
5891  * a host driver.
5892  *
5893  * Returns the config-space offset of the matching extended capability
5894  * structure, or 0 if the device does not support it.
5895  */
5896 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5897 {
5898         int pos;
5899         u16 vendor, id;
5900
5901         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5902         while (pos) {
5903                 pci_read_config_word(pdev, pos + 4, &vendor);
5904                 pci_read_config_word(pdev, pos + 8, &id);
5905                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5906                         return pos;
5907
5908                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5909         }
5910
5911         return 0;
5912 }
5913
5914 static bool
5915 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5916 {
5917         if (feat == IOMMU_DEV_FEAT_AUX) {
5918                 int ret;
5919
5920                 if (!dev_is_pci(dev) || dmar_disabled ||
5921                     !scalable_mode_support() || !iommu_pasid_support())
5922                         return false;
5923
5924                 ret = pci_pasid_features(to_pci_dev(dev));
5925                 if (ret < 0)
5926                         return false;
5927
5928                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5929         }
5930
5931         return false;
5932 }
5933
5934 static int
5935 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5936 {
5937         if (feat == IOMMU_DEV_FEAT_AUX)
5938                 return intel_iommu_enable_auxd(dev);
5939
5940         return -ENODEV;
5941 }
5942
5943 static int
5944 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5945 {
5946         if (feat == IOMMU_DEV_FEAT_AUX)
5947                 return intel_iommu_disable_auxd(dev);
5948
5949         return -ENODEV;
5950 }
5951
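     /*
      * Report whether @feat is currently enabled for @dev; for
      * IOMMU_DEV_FEAT_AUX this means scalable mode is supported and
      * intel_iommu_enable_auxd() has set the device's auxd_enabled flag.
      */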
5952 static bool
5953 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5954 {
5955         struct device_domain_info *info = dev->archdata.iommu;
5956
5957         if (feat == IOMMU_DEV_FEAT_AUX)
5958                 return scalable_mode_support() && info && info->auxd_enabled;
5959
5960         return false;
5961 }
5962
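     /*
      * Return the default PASID used for aux-domain traffic of @domain, or
      * -EINVAL if no PASID has been allocated for it.
      */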
5963 static int
5964 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5965 {
5966         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5967
5968         return dmar_domain->default_pasid > 0 ?
5969                         dmar_domain->default_pasid : -EINVAL;
5970 }
5971
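     /*
      * Domain attachment is deferred for devices whose archdata.iommu still
      * holds the DEFER_DEVICE_DOMAIN_INFO marker rather than a real
      * device_domain_info.
      */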
5972 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5973                                            struct device *dev)
5974 {
5975         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5976 }
5977
5978 /*
5979  * Check that the device does not live on an external-facing PCI port that
5980  * is marked as untrusted. Such devices should not be allowed to apply
5981  * quirks, since that could let them bypass IOMMU restrictions.
5982  */
5983 static bool risky_device(struct pci_dev *pdev)
5984 {
5985         if (pdev->untrusted) {
5986                 pci_info(pdev,
5987                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
5988                          pdev->vendor, pdev->device);
5989                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
5990                 return true;
5991         }
5992         return false;
5993 }
5994
5995 const struct iommu_ops intel_iommu_ops = {
5996         .capable                = intel_iommu_capable,
5997         .domain_alloc           = intel_iommu_domain_alloc,
5998         .domain_free            = intel_iommu_domain_free,
5999         .attach_dev             = intel_iommu_attach_device,
6000         .detach_dev             = intel_iommu_detach_device,
6001         .aux_attach_dev         = intel_iommu_aux_attach_device,
6002         .aux_detach_dev         = intel_iommu_aux_detach_device,
6003         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6004         .map                    = intel_iommu_map,
6005         .unmap                  = intel_iommu_unmap,
6006         .iova_to_phys           = intel_iommu_iova_to_phys,
6007         .add_device             = intel_iommu_add_device,
6008         .remove_device          = intel_iommu_remove_device,
6009         .get_resv_regions       = intel_iommu_get_resv_regions,
6010         .put_resv_regions       = intel_iommu_put_resv_regions,
6011         .apply_resv_region      = intel_iommu_apply_resv_region,
6012         .device_group           = intel_iommu_device_group,
6013         .dev_has_feat           = intel_iommu_dev_has_feat,
6014         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6015         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6016         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6017         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6018         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6019 };
6020
6021 static void quirk_iommu_igfx(struct pci_dev *dev)
6022 {
6023         if (risky_device(dev))
6024                 return;
6025
6026         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6027         dmar_map_gfx = 0;
6028 }
6029
6030 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6036 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6037 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6038
6039 /* Broadwell igfx malfunctions with dmar */
6040 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6041 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6042 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6043 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6044 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6045 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6046 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6047 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6048 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6049 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6050 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6051 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6052 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6053 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6054 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6055 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6056 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6057 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6058 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6059 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6060 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6061 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6062 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6063 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6064
6065 static void quirk_iommu_rwbf(struct pci_dev *dev)
6066 {
6067         if (risky_device(dev))
6068                 return;
6069
6070         /*
6071          * Mobile 4 Series Chipset neglects to set RWBF capability,
6072          * but needs it. Same seems to hold for the desktop versions.
6073          */
6074         pci_info(dev, "Forcing write-buffer flush capability\n");
6075         rwbf_quirk = 1;
6076 }
6077
6078 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6079 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6080 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6081 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6082 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6083 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6084 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6085
6086 #define GGC 0x52
6087 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6088 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6089 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6090 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6091 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6092 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6093 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6094 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6095
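     /*
      * GGC (offset 0x52) is presumably the Graphics Control register of
      * these Ironlake/Calpella host bridges; the VT_ENABLED bit indicates
      * whether the BIOS reserved stolen memory for the shadow GTT that the
      * IOMMU needs for graphics translation.
      */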
6096 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6097 {
6098         unsigned short ggc;
6099
6100         if (risky_device(dev))
6101                 return;
6102
6103         if (pci_read_config_word(dev, GGC, &ggc))
6104                 return;
6105
6106         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6107                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6108                 dmar_map_gfx = 0;
6109         } else if (dmar_map_gfx) {
6110                 /* we have to ensure the gfx device is idle before we flush */
6111                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6112                 intel_iommu_strict = 1;
6113         }
6114 }
6115 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6116 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6117 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6119
6120 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6121    ISOCH DMAR unit for the Azalia sound device, but not give it any
6122    TLB entries, which causes it to deadlock. Check for that.  We do
6123    this in a function called from init_dmars(), instead of in a PCI
6124    quirk, because we don't want to print the obnoxious "BIOS broken"
6125    message if VT-d is actually disabled.
6126 */
6127 static void __init check_tylersburg_isoch(void)
6128 {
6129         struct pci_dev *pdev;
6130         uint32_t vtisochctrl;
6131
6132         /* If there's no Azalia in the system anyway, forget it. */
6133         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6134         if (!pdev)
6135                 return;
6136
6137         if (risky_device(pdev)) {
6138                 pci_dev_put(pdev);
6139                 return;
6140         }
6141
6142         pci_dev_put(pdev);
6143
6144         /* System Management Registers. Might be hidden, in which case
6145            we can't do the sanity check. But that's OK, because the
6146            known-broken BIOSes _don't_ actually hide it, so far. */
6147         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6148         if (!pdev)
6149                 return;
6150
6151         if (risky_device(pdev)) {
6152                 pci_dev_put(pdev);
6153                 return;
6154         }
6155
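             /*
              * Offset 0x188 is assumed to be the VT-d isoch control register
              * of this system-management device (hence "vtisochctrl"); bit 0
              * means Azalia DMA is routed to the non-isoch DMAR unit, and the
              * masked field below is the isoch unit's TLB entry allocation.
              */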
6156         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6157                 pci_dev_put(pdev);
6158                 return;
6159         }
6160
6161         pci_dev_put(pdev);
6162
6163         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6164         if (vtisochctrl & 1)
6165                 return;
6166
6167         /* Drop all bits other than the number of TLB entries */
6168         vtisochctrl &= 0x1c;
6169
6170         /* If we have the recommended number of TLB entries (16), fine. */
6171         if (vtisochctrl == 0x10)
6172                 return;
6173
6174         /* Zero TLB entries? Then the BIOS is definitely broken. */
6175         if (!vtisochctrl) {
6176                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6177                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6178                      dmi_get_system_info(DMI_BIOS_VENDOR),
6179                      dmi_get_system_info(DMI_BIOS_VERSION),
6180                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6181                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6182                 return;
6183         }
6184
6185         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6186                vtisochctrl);
6187 }