GNU Linux-libre 6.1.24-gnu
arch/powerpc/kernel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
4  * 
5  * Rewrite, cleanup, new allocation schemes, virtual merging: 
6  * Copyright (C) 2004 Olof Johansson, IBM Corporation
7  *               and  Ben. Herrenschmidt, IBM Corporation
8  *
9  * Dynamic DMA mapping support, bus-independent parts.
10  */
11
12
13 #include <linux/init.h>
14 #include <linux/types.h>
15 #include <linux/slab.h>
16 #include <linux/mm.h>
17 #include <linux/spinlock.h>
18 #include <linux/string.h>
19 #include <linux/dma-mapping.h>
20 #include <linux/bitmap.h>
21 #include <linux/iommu-helper.h>
22 #include <linux/crash_dump.h>
23 #include <linux/hash.h>
24 #include <linux/fault-inject.h>
25 #include <linux/pci.h>
26 #include <linux/iommu.h>
27 #include <linux/sched.h>
28 #include <linux/debugfs.h>
29 #include <asm/io.h>
30 #include <asm/iommu.h>
31 #include <asm/pci-bridge.h>
32 #include <asm/machdep.h>
33 #include <asm/kdump.h>
34 #include <asm/fadump.h>
35 #include <asm/vio.h>
36 #include <asm/tce.h>
37 #include <asm/mmu_context.h>
38
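/* DBG() is compiled out; redefine it (e.g. to pr_debug) to trace mappings. */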
39 #define DBG(...)
40
41 #ifdef CONFIG_IOMMU_DEBUGFS
42 static int iommu_debugfs_weight_get(void *data, u64 *val)
43 {
44         struct iommu_table *tbl = data;
45         *val = bitmap_weight(tbl->it_map, tbl->it_size);
46         return 0;
47 }
48 DEFINE_DEBUGFS_ATTRIBUTE(iommu_debugfs_fops_weight, iommu_debugfs_weight_get, NULL, "%llu\n");
49
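/*
 * Each table gets a debugfs directory named after its LIOBN (it_index);
 * "weight" reports how many TCE entries are currently set in it_map.
 */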
50 static void iommu_debugfs_add(struct iommu_table *tbl)
51 {
52         char name[10];
53         struct dentry *liobn_entry;
54
55         sprintf(name, "%08lx", tbl->it_index);
56         liobn_entry = debugfs_create_dir(name, iommu_debugfs_dir);
57
58         debugfs_create_file_unsafe("weight", 0400, liobn_entry, tbl, &iommu_debugfs_fops_weight);
59         debugfs_create_ulong("it_size", 0400, liobn_entry, &tbl->it_size);
60         debugfs_create_ulong("it_page_shift", 0400, liobn_entry, &tbl->it_page_shift);
61         debugfs_create_ulong("it_reserved_start", 0400, liobn_entry, &tbl->it_reserved_start);
62         debugfs_create_ulong("it_reserved_end", 0400, liobn_entry, &tbl->it_reserved_end);
63         debugfs_create_ulong("it_indirect_levels", 0400, liobn_entry, &tbl->it_indirect_levels);
64         debugfs_create_ulong("it_level_size", 0400, liobn_entry, &tbl->it_level_size);
65 }
66
67 static void iommu_debugfs_del(struct iommu_table *tbl)
68 {
69         char name[10];
70
71         sprintf(name, "%08lx", tbl->it_index);
72         debugfs_lookup_and_remove(name, iommu_debugfs_dir);
73 }
74 #else
75 static void iommu_debugfs_add(struct iommu_table *tbl){}
76 static void iommu_debugfs_del(struct iommu_table *tbl){}
77 #endif
78
79 static int novmerge;
80
81 static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
82
83 static int __init setup_iommu(char *str)
84 {
85         if (!strcmp(str, "novmerge"))
86                 novmerge = 1;
87         else if (!strcmp(str, "vmerge"))
88                 novmerge = 0;
89         return 1;
90 }
91
92 __setup("iommu=", setup_iommu);
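/*
 * Example: booting with "iommu=novmerge" disables virtual merging of
 * scatterlist segments in ppc_iommu_map_sg(); "iommu=vmerge" restores
 * the default merging behaviour.
 */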
93
94 static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
95
96 /*
97  * We precalculate the hash to avoid doing it on every allocation.
98  *
99  * The hash is important to spread CPUs across all the pools. For example,
100  * on a POWER7 with 4-way SMT, interrupts are taken on the primary threads;
101  * with 4 pools, all primary threads would otherwise map to the same pool.
102  */
103 static int __init setup_iommu_pool_hash(void)
104 {
105         unsigned int i;
106
107         for_each_possible_cpu(i)
108                 per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
109
110         return 0;
111 }
112 subsys_initcall(setup_iommu_pool_hash);
113
114 #ifdef CONFIG_FAIL_IOMMU
115
116 static DECLARE_FAULT_ATTR(fail_iommu);
117
118 static int __init setup_fail_iommu(char *str)
119 {
120         return setup_fault_attr(&fail_iommu, str);
121 }
122 __setup("fail_iommu=", setup_fail_iommu);
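/*
 * The "fail_iommu=" parameter takes the common fault-injection syntax
 * (see Documentation/fault-injection/); injection is additionally gated
 * per device by the "fail_iommu" sysfs attribute created below.
 */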
123
124 static bool should_fail_iommu(struct device *dev)
125 {
126         return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
127 }
128
129 static int __init fail_iommu_debugfs(void)
130 {
131         struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
132                                                        NULL, &fail_iommu);
133
134         return PTR_ERR_OR_ZERO(dir);
135 }
136 late_initcall(fail_iommu_debugfs);
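/*
 * With a NULL parent, fault_create_debugfs_attr() places the runtime
 * fault-injection knobs in a "fail_iommu" directory at the debugfs root.
 */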
137
138 static ssize_t fail_iommu_show(struct device *dev,
139                                struct device_attribute *attr, char *buf)
140 {
141         return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
142 }
143
144 static ssize_t fail_iommu_store(struct device *dev,
145                                 struct device_attribute *attr, const char *buf,
146                                 size_t count)
147 {
148         int i;
149
150         if (count > 0 && sscanf(buf, "%d", &i) > 0)
151                 dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
152
153         return count;
154 }
155
156 static DEVICE_ATTR_RW(fail_iommu);
157
158 static int fail_iommu_bus_notify(struct notifier_block *nb,
159                                  unsigned long action, void *data)
160 {
161         struct device *dev = data;
162
163         if (action == BUS_NOTIFY_ADD_DEVICE) {
164                 if (device_create_file(dev, &dev_attr_fail_iommu))
165                         pr_warn("Unable to create IOMMU fault injection sysfs "
166                                 "entries\n");
167         } else if (action == BUS_NOTIFY_DEL_DEVICE) {
168                 device_remove_file(dev, &dev_attr_fail_iommu);
169         }
170
171         return 0;
172 }
173
174 static struct notifier_block fail_iommu_bus_notifier = {
175         .notifier_call = fail_iommu_bus_notify
176 };
177
178 static int __init fail_iommu_setup(void)
179 {
180 #ifdef CONFIG_PCI
181         bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
182 #endif
183 #ifdef CONFIG_IBMVIO
184         bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
185 #endif
186
187         return 0;
188 }
189 /*
190  * Must execute after the PCI and VIO subsystems have initialised but before
191  * devices are probed.
192  */
193 arch_initcall(fail_iommu_setup);
194 #else
195 static inline bool should_fail_iommu(struct device *dev)
196 {
197         return false;
198 }
199 #endif
200
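/*
 * Allocate a run of @npages entries from the table's bitmap. Pass 0
 * starts at the pool hint (or *handle for scatterlist mappings); later
 * passes rescan the pool from its start, then the remaining pools, and
 * finally the large pool before giving up with DMA_MAPPING_ERROR.
 */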
201 static unsigned long iommu_range_alloc(struct device *dev,
202                                        struct iommu_table *tbl,
203                                        unsigned long npages,
204                                        unsigned long *handle,
205                                        unsigned long mask,
206                                        unsigned int align_order)
207 {
208         unsigned long n, end, start;
209         unsigned long limit;
210         int largealloc = npages > 15;
211         int pass = 0;
212         unsigned long align_mask;
213         unsigned long flags;
214         unsigned int pool_nr;
215         struct iommu_pool *pool;
216
217         align_mask = (1ull << align_order) - 1;
218
219         /* This allocator was derived from x86_64's bit string search */
220
221         /* Sanity check */
222         if (unlikely(npages == 0)) {
223                 if (printk_ratelimit())
224                         WARN_ON(1);
225                 return DMA_MAPPING_ERROR;
226         }
227
228         if (should_fail_iommu(dev))
229                 return DMA_MAPPING_ERROR;
230
231         /*
232          * We don't need to disable preemption here because any CPU can
233          * safely use any IOMMU pool.
234          */
235         pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
236
237         if (largealloc)
238                 pool = &(tbl->large_pool);
239         else
240                 pool = &(tbl->pools[pool_nr]);
241
242         spin_lock_irqsave(&(pool->lock), flags);
243
244 again:
245         if ((pass == 0) && handle && *handle &&
246             (*handle >= pool->start) && (*handle < pool->end))
247                 start = *handle;
248         else
249                 start = pool->hint;
250
251         limit = pool->end;
252
253         /* The case below can happen if we have a small segment appended
254          * to a large one, or when the previous alloc was at the very end of
255          * the available space. If so, go back to the initial start.
256          */
257         if (start >= limit)
258                 start = pool->start;
259
260         if (limit + tbl->it_offset > mask) {
261                 limit = mask - tbl->it_offset + 1;
262                 /* If we're constrained on address range, first try
263                  * at the masked hint to avoid O(n) search complexity,
264                  * but on second pass, start at 0 in pool 0.
265                  */
266                 if ((start & mask) >= limit || pass > 0) {
267                         spin_unlock(&(pool->lock));
268                         pool = &(tbl->pools[0]);
269                         spin_lock(&(pool->lock));
270                         start = pool->start;
271                 } else {
272                         start &= mask;
273                 }
274         }
275
276         n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
277                         dma_get_seg_boundary_nr_pages(dev, tbl->it_page_shift),
278                         align_mask);
279         if (n == -1) {
280                 if (likely(pass == 0)) {
281                         /* First try the pool from the start */
282                         pool->hint = pool->start;
283                         pass++;
284                         goto again;
285
286                 } else if (pass <= tbl->nr_pools) {
287                         /* Now try scanning all the other pools */
288                         spin_unlock(&(pool->lock));
289                         pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
290                         pool = &tbl->pools[pool_nr];
291                         spin_lock(&(pool->lock));
292                         pool->hint = pool->start;
293                         pass++;
294                         goto again;
295
296                 } else if (pass == tbl->nr_pools + 1) {
297                         /* Last resort: try largepool */
298                         spin_unlock(&pool->lock);
299                         pool = &tbl->large_pool;
300                         spin_lock(&pool->lock);
301                         pool->hint = pool->start;
302                         pass++;
303                         goto again;
304
305                 } else {
306                         /* Give up */
307                         spin_unlock_irqrestore(&(pool->lock), flags);
308                         return DMA_MAPPING_ERROR;
309                 }
310         }
311
312         end = n + npages;
313
314         /* Bump the hint to a new block for small allocs. */
315         if (largealloc) {
316                 /* Don't bump to new block to avoid fragmentation */
317                 pool->hint = end;
318         } else {
319                 /* Overflow will be taken care of at the next allocation */
320                 pool->hint = (end + tbl->it_blocksize - 1) &
321                                 ~(tbl->it_blocksize - 1);
322         }
323
324         /* Update handle for SG allocations */
325         if (handle)
326                 *handle = end;
327
328         spin_unlock_irqrestore(&(pool->lock), flags);
329
330         return n;
331 }
332
333 static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
334                               void *page, unsigned int npages,
335                               enum dma_data_direction direction,
336                               unsigned long mask, unsigned int align_order,
337                               unsigned long attrs)
338 {
339         unsigned long entry;
340         dma_addr_t ret = DMA_MAPPING_ERROR;
341         int build_fail;
342
343         entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
344
345         if (unlikely(entry == DMA_MAPPING_ERROR))
346                 return DMA_MAPPING_ERROR;
347
348         entry += tbl->it_offset;        /* Offset into real TCE table */
349         ret = entry << tbl->it_page_shift;      /* Set the return dma address */
350
351         /* Put the TCEs in the HW table */
352         build_fail = tbl->it_ops->set(tbl, entry, npages,
353                                       (unsigned long)page &
354                                       IOMMU_PAGE_MASK(tbl), direction, attrs);
355
356         /* tbl->it_ops->set() only returns non-zero for transient errors.
357          * Clean up the table bitmap in this case and return
358          * DMA_MAPPING_ERROR. For all other errors the functionality is
359          * not altered.
360          */
361         if (unlikely(build_fail)) {
362                 __iommu_free(tbl, ret, npages);
363                 return DMA_MAPPING_ERROR;
364         }
365
366         /* Flush/invalidate TLB caches if necessary */
367         if (tbl->it_ops->flush)
368                 tbl->it_ops->flush(tbl);
369
370         /* Make sure updates are seen by hardware */
371         mb();
372
373         return ret;
374 }
375
376 static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
377                              unsigned int npages)
378 {
379         unsigned long entry, free_entry;
380
381         entry = dma_addr >> tbl->it_page_shift;
382         free_entry = entry - tbl->it_offset;
383
384         if (((free_entry + npages) > tbl->it_size) ||
385             (entry < tbl->it_offset)) {
386                 if (printk_ratelimit()) {
387                         printk(KERN_INFO "iommu_free: invalid entry\n");
388                         printk(KERN_INFO "\tentry     = 0x%lx\n", entry); 
389                         printk(KERN_INFO "\tdma_addr  = 0x%llx\n", (u64)dma_addr);
390                         printk(KERN_INFO "\tTable     = 0x%llx\n", (u64)tbl);
391                         printk(KERN_INFO "\tbus#      = 0x%llx\n", (u64)tbl->it_busno);
392                         printk(KERN_INFO "\tsize      = 0x%llx\n", (u64)tbl->it_size);
393                         printk(KERN_INFO "\tstartOff  = 0x%llx\n", (u64)tbl->it_offset);
394                         printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
395                         WARN_ON(1);
396                 }
397
398                 return false;
399         }
400
401         return true;
402 }
403
404 static struct iommu_pool *get_pool(struct iommu_table *tbl,
405                                    unsigned long entry)
406 {
407         struct iommu_pool *p;
408         unsigned long largepool_start = tbl->large_pool.start;
409
410         /* The large pool is the last pool at the top of the table */
411         if (entry >= largepool_start) {
412                 p = &tbl->large_pool;
413         } else {
414                 unsigned int pool_nr = entry / tbl->poolsize;
415
416                 BUG_ON(pool_nr > tbl->nr_pools);
417                 p = &tbl->pools[pool_nr];
418         }
419
420         return p;
421 }
422
423 static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
424                          unsigned int npages)
425 {
426         unsigned long entry, free_entry;
427         unsigned long flags;
428         struct iommu_pool *pool;
429
430         entry = dma_addr >> tbl->it_page_shift;
431         free_entry = entry - tbl->it_offset;
432
433         pool = get_pool(tbl, free_entry);
434
435         if (!iommu_free_check(tbl, dma_addr, npages))
436                 return;
437
438         tbl->it_ops->clear(tbl, entry, npages);
439
440         spin_lock_irqsave(&(pool->lock), flags);
441         bitmap_clear(tbl->it_map, free_entry, npages);
442         spin_unlock_irqrestore(&(pool->lock), flags);
443 }
444
445 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
446                 unsigned int npages)
447 {
448         __iommu_free(tbl, dma_addr, npages);
449
450         /* Make sure TLB cache is flushed if the HW needs it. We do
451          * not do an mb() here on purpose, it is not needed on any of
452          * the current platforms.
453          */
454         if (tbl->it_ops->flush)
455                 tbl->it_ops->flush(tbl);
456 }
457
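/*
 * Map a scatterlist through the TCE table. Entries whose DMA addresses
 * turn out to be contiguous are merged into one segment unless
 * "iommu=novmerge" was given or the device's maximum segment size would
 * be exceeded. Returns the number of DMA segments or a negative errno.
 */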
458 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
459                      struct scatterlist *sglist, int nelems,
460                      unsigned long mask, enum dma_data_direction direction,
461                      unsigned long attrs)
462 {
463         dma_addr_t dma_next = 0, dma_addr;
464         struct scatterlist *s, *outs, *segstart;
465         int outcount, incount, i, build_fail = 0;
466         unsigned int align;
467         unsigned long handle;
468         unsigned int max_seg_size;
469
470         BUG_ON(direction == DMA_NONE);
471
472         if ((nelems == 0) || !tbl)
473                 return -EINVAL;
474
475         outs = s = segstart = &sglist[0];
476         outcount = 1;
477         incount = nelems;
478         handle = 0;
479
480         /* Init first segment length for backout at failure */
481         outs->dma_length = 0;
482
483         DBG("sg mapping %d elements:\n", nelems);
484
485         max_seg_size = dma_get_max_seg_size(dev);
486         for_each_sg(sglist, s, nelems, i) {
487                 unsigned long vaddr, npages, entry, slen;
488
489                 slen = s->length;
490                 /* Sanity check */
491                 if (slen == 0) {
492                         dma_next = 0;
493                         continue;
494                 }
495                 /* Allocate iommu entries for that segment */
496                 vaddr = (unsigned long) sg_virt(s);
497                 npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
498                 align = 0;
499                 if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
500                     (vaddr & ~PAGE_MASK) == 0)
501                         align = PAGE_SHIFT - tbl->it_page_shift;
502                 entry = iommu_range_alloc(dev, tbl, npages, &handle,
503                                           mask >> tbl->it_page_shift, align);
504
505                 DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
506
507                 /* Handle failure */
508                 if (unlikely(entry == DMA_MAPPING_ERROR)) {
509                         if (!(attrs & DMA_ATTR_NO_WARN) &&
510                             printk_ratelimit())
511                                 dev_info(dev, "iommu_alloc failed, tbl %p "
512                                          "vaddr %lx npages %lu\n", tbl, vaddr,
513                                          npages);
514                         goto failure;
515                 }
516
517                 /* Convert entry to a dma_addr_t */
518                 entry += tbl->it_offset;
519                 dma_addr = entry << tbl->it_page_shift;
520                 dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
521
522                 DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
523                             npages, entry, dma_addr);
524
525                 /* Insert into HW table */
526                 build_fail = tbl->it_ops->set(tbl, entry, npages,
527                                               vaddr & IOMMU_PAGE_MASK(tbl),
528                                               direction, attrs);
529                 if (unlikely(build_fail))
530                         goto failure;
531
532                 /* If we are in an open segment, try merging */
533                 if (segstart != s) {
534                         DBG("  - trying merge...\n");
535                         /* We cannot merge if:
536                          * - allocated dma_addr isn't contiguous to previous allocation
537                          */
538                         if (novmerge || (dma_addr != dma_next) ||
539                             (outs->dma_length + s->length > max_seg_size)) {
540                                 /* Can't merge: create a new segment */
541                                 segstart = s;
542                                 outcount++;
543                                 outs = sg_next(outs);
544                                 DBG("    can't merge, new segment.\n");
545                         } else {
546                                 outs->dma_length += s->length;
547                                 DBG("    merged, new len: %u\n", outs->dma_length);
548                         }
549                 }
550
551                 if (segstart == s) {
552                         /* This is a new segment, fill entries */
553                         DBG("  - filling new segment.\n");
554                         outs->dma_address = dma_addr;
555                         outs->dma_length = slen;
556                 }
557
558                 /* Calculate next page pointer for contiguous check */
559                 dma_next = dma_addr + slen;
560
561                 DBG("  - dma next is: %lx\n", dma_next);
562         }
563
564         /* Flush/invalidate TLB caches if necessary */
565         if (tbl->it_ops->flush)
566                 tbl->it_ops->flush(tbl);
567
568         DBG("mapped %d elements:\n", outcount);
569
570         /* For the sake of ppc_iommu_unmap_sg, we clear out the length in the
571          * next entry of the sglist if we didn't fill the list completely
572          */
573         if (outcount < incount) {
574                 outs = sg_next(outs);
575                 outs->dma_length = 0;
576         }
577
578         /* Make sure updates are seen by hardware */
579         mb();
580
581         return outcount;
582
583  failure:
584         for_each_sg(sglist, s, nelems, i) {
585                 if (s->dma_length != 0) {
586                         unsigned long vaddr, npages;
587
588                         vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
589                         npages = iommu_num_pages(s->dma_address, s->dma_length,
590                                                  IOMMU_PAGE_SIZE(tbl));
591                         __iommu_free(tbl, vaddr, npages);
592                         s->dma_length = 0;
593                 }
594                 if (s == outs)
595                         break;
596         }
597         return -EIO;
598 }
599
600
601 void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
602                         int nelems, enum dma_data_direction direction,
603                         unsigned long attrs)
604 {
605         struct scatterlist *sg;
606
607         BUG_ON(direction == DMA_NONE);
608
609         if (!tbl)
610                 return;
611
612         sg = sglist;
613         while (nelems--) {
614                 unsigned int npages;
615                 dma_addr_t dma_handle = sg->dma_address;
616
617                 if (sg->dma_length == 0)
618                         break;
619                 npages = iommu_num_pages(dma_handle, sg->dma_length,
620                                          IOMMU_PAGE_SIZE(tbl));
621                 __iommu_free(tbl, dma_handle, npages);
622                 sg = sg_next(sg);
623         }
624
625         /* Flush/invalidate TLBs if necessary. As for iommu_free(), we
626          * do not do an mb() here, the affected platforms do not need it
627          * when freeing.
628          */
629         if (tbl->it_ops->flush)
630                 tbl->it_ops->flush(tbl);
631 }
632
633 static void iommu_table_clear(struct iommu_table *tbl)
634 {
635         /*
636          * With firmware-assisted dump the system goes through a clean
637          * reboot at the time of the crash, so it is safe to clear the
638          * TCE entries when firmware-assisted dump is active.
639          */
640         if (!is_kdump_kernel() || is_fadump_active()) {
641                 /* Clear the table in case firmware left allocations in it */
642                 tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
643                 return;
644         }
645
646 #ifdef CONFIG_CRASH_DUMP
647         if (tbl->it_ops->get) {
648                 unsigned long index, tceval, tcecount = 0;
649
650                 /* Reserve the existing mappings left by the first kernel. */
651                 for (index = 0; index < tbl->it_size; index++) {
652                         tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
653                         /*
654                          * Freed TCE entry contains 0x7fffffffffffffff on JS20
655                          */
656                         if (tceval && (tceval != 0x7fffffffffffffffUL)) {
657                                 __set_bit(index, tbl->it_map);
658                                 tcecount++;
659                         }
660                 }
661
662                 if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
663                         printk(KERN_WARNING "TCE table is full; freeing ");
664                         printk(KERN_WARNING "%d entries for the kdump boot\n",
665                                 KDUMP_MIN_TCE_ENTRIES);
666                         for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
667                                 index < tbl->it_size; index++)
668                                 __clear_bit(index, tbl->it_map);
669                 }
670         }
671 #endif
672 }
673
674 static void iommu_table_reserve_pages(struct iommu_table *tbl,
675                 unsigned long res_start, unsigned long res_end)
676 {
677         int i;
678
679         WARN_ON_ONCE(res_end < res_start);
680         /*
681          * Reserve page 0 so it will not be used for any mappings.
682          * This keeps buggy drivers that treat DMA address 0 as invalid
683          * from crashing the machine or even losing data.
684          */
685         if (tbl->it_offset == 0)
686                 set_bit(0, tbl->it_map);
687
688         if (res_start < tbl->it_offset)
689                 res_start = tbl->it_offset;
690
691         if (res_end > (tbl->it_offset + tbl->it_size))
692                 res_end = tbl->it_offset + tbl->it_size;
693
694         /* Check if res_start..res_end is a valid range in the table */
695         if (res_start >= res_end) {
696                 tbl->it_reserved_start = tbl->it_offset;
697                 tbl->it_reserved_end = tbl->it_offset;
698                 return;
699         }
700
701         tbl->it_reserved_start = res_start;
702         tbl->it_reserved_end = res_end;
703
704         for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)
705                 set_bit(i - tbl->it_offset, tbl->it_map);
706 }
707
708 /*
709  * Build an iommu_table structure.  This contains a bitmap which
710  * is used to manage allocation of the TCE space.
711  */
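/*
 * The bottom 3/4 of the window is split evenly between nr_pools pools
 * for small allocations; the top 1/4 forms the large pool used for
 * requests of more than 15 pages.
 */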
712 struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid,
713                 unsigned long res_start, unsigned long res_end)
714 {
715         unsigned long sz;
716         static int welcomed = 0;
717         unsigned int i;
718         struct iommu_pool *p;
719
720         BUG_ON(!tbl->it_ops);
721
722         /* number of bytes needed for the bitmap */
723         sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
724
725         tbl->it_map = vzalloc_node(sz, nid);
726         if (!tbl->it_map) {
727                 pr_err("%s: Can't allocate %ld bytes\n", __func__, sz);
728                 return NULL;
729         }
730
731         iommu_table_reserve_pages(tbl, res_start, res_end);
732
733         /* We only split the IOMMU table if we have 1GB or more of space */
734         if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
735                 tbl->nr_pools = IOMMU_NR_POOLS;
736         else
737                 tbl->nr_pools = 1;
738
739         /* We reserve the top 1/4 of the table for large allocations */
740         tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
741
742         for (i = 0; i < tbl->nr_pools; i++) {
743                 p = &tbl->pools[i];
744                 spin_lock_init(&(p->lock));
745                 p->start = tbl->poolsize * i;
746                 p->hint = p->start;
747                 p->end = p->start + tbl->poolsize;
748         }
749
750         p = &tbl->large_pool;
751         spin_lock_init(&(p->lock));
752         p->start = tbl->poolsize * i;
753         p->hint = p->start;
754         p->end = tbl->it_size;
755
756         iommu_table_clear(tbl);
757
758         if (!welcomed) {
759                 printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
760                        novmerge ? "disabled" : "enabled");
761                 welcomed = 1;
762         }
763
764         iommu_debugfs_add(tbl);
765
766         return tbl;
767 }
768
769 bool iommu_table_in_use(struct iommu_table *tbl)
770 {
771         unsigned long start = 0, end;
772
773         /* ignore reserved bit0 */
774         if (tbl->it_offset == 0)
775                 start = 1;
776
777         /* Simple case with no reserved MMIO32 region */
778         if (!tbl->it_reserved_start && !tbl->it_reserved_end)
779                 return find_next_bit(tbl->it_map, tbl->it_size, start) != tbl->it_size;
780
781         end = tbl->it_reserved_start - tbl->it_offset;
782         if (find_next_bit(tbl->it_map, end, start) != end)
783                 return true;
784
785         start = tbl->it_reserved_end - tbl->it_offset;
786         end = tbl->it_size;
787         return find_next_bit(tbl->it_map, end, start) != end;
788 }
789
790 static void iommu_table_free(struct kref *kref)
791 {
792         struct iommu_table *tbl;
793
794         tbl = container_of(kref, struct iommu_table, it_kref);
795
796         if (tbl->it_ops->free)
797                 tbl->it_ops->free(tbl);
798
799         if (!tbl->it_map) {
800                 kfree(tbl);
801                 return;
802         }
803
804         iommu_debugfs_del(tbl);
805
806         /* verify that table contains no entries */
807         if (iommu_table_in_use(tbl))
808                 pr_warn("%s: Unexpected TCEs\n", __func__);
809
810         /* free bitmap */
811         vfree(tbl->it_map);
812
813         /* free table */
814         kfree(tbl);
815 }
816
817 struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
818 {
819         if (kref_get_unless_zero(&tbl->it_kref))
820                 return tbl;
821
822         return NULL;
823 }
824 EXPORT_SYMBOL_GPL(iommu_tce_table_get);
825
826 int iommu_tce_table_put(struct iommu_table *tbl)
827 {
828         if (WARN_ON(!tbl))
829                 return 0;
830
831         return kref_put(&tbl->it_kref, iommu_table_free);
832 }
833 EXPORT_SYMBOL_GPL(iommu_tce_table_put);
834
835 /* Creates TCEs for a user provided buffer.  The user buffer must be
836  * contiguous real kernel storage (not vmalloc).  The address passed here
837  * comprises a page address and offset into that page. The dma_addr_t
838  * returned will point to the same byte within the page as was passed in.
839  */
840 dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
841                           struct page *page, unsigned long offset, size_t size,
842                           unsigned long mask, enum dma_data_direction direction,
843                           unsigned long attrs)
844 {
845         dma_addr_t dma_handle = DMA_MAPPING_ERROR;
846         void *vaddr;
847         unsigned long uaddr;
848         unsigned int npages, align;
849
850         BUG_ON(direction == DMA_NONE);
851
852         vaddr = page_address(page) + offset;
853         uaddr = (unsigned long)vaddr;
854
855         if (tbl) {
856                 npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
857                 align = 0;
858                 if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
859                     ((unsigned long)vaddr & ~PAGE_MASK) == 0)
860                         align = PAGE_SHIFT - tbl->it_page_shift;
861
862                 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
863                                          mask >> tbl->it_page_shift, align,
864                                          attrs);
865                 if (dma_handle == DMA_MAPPING_ERROR) {
866                         if (!(attrs & DMA_ATTR_NO_WARN) &&
867                             printk_ratelimit())  {
868                                 dev_info(dev, "iommu_alloc failed, tbl %p "
869                                          "vaddr %p npages %d\n", tbl, vaddr,
870                                          npages);
871                         }
872                 } else
873                         dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
874         }
875
876         return dma_handle;
877 }
878
879 void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
880                       size_t size, enum dma_data_direction direction,
881                       unsigned long attrs)
882 {
883         unsigned int npages;
884
885         BUG_ON(direction == DMA_NONE);
886
887         if (tbl) {
888                 npages = iommu_num_pages(dma_handle, size,
889                                          IOMMU_PAGE_SIZE(tbl));
890                 iommu_free(tbl, dma_handle, npages);
891         }
892 }
893
894 /* Allocates a contiguous real buffer and creates mappings over it.
895  * Returns the virtual address of the buffer and sets dma_handle
896  * to the dma address (mapping) of the first page.
897  */
898 void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
899                            size_t size, dma_addr_t *dma_handle,
900                            unsigned long mask, gfp_t flag, int node)
901 {
902         void *ret = NULL;
903         dma_addr_t mapping;
904         unsigned int order;
905         unsigned int nio_pages, io_order;
906         struct page *page;
907
908         size = PAGE_ALIGN(size);
909         order = get_order(size);
910
911         /*
912          * Client asked for way too much space.  This is checked later
913          * anyway.  It is easier to debug here for the drivers than in
914          * the tce tables.
915          */
916         if (order >= IOMAP_MAX_ORDER) {
917                 dev_info(dev, "iommu_alloc_consistent size too large: 0x%lx\n",
918                          size);
919                 return NULL;
920         }
921
922         if (!tbl)
923                 return NULL;
924
925         /* Alloc enough pages (and possibly more) */
926         page = alloc_pages_node(node, flag, order);
927         if (!page)
928                 return NULL;
929         ret = page_address(page);
930         memset(ret, 0, size);
931
932         /* Set up tces to cover the allocated range */
933         nio_pages = size >> tbl->it_page_shift;
934         io_order = get_iommu_order(size, tbl);
935         mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
936                               mask >> tbl->it_page_shift, io_order, 0);
937         if (mapping == DMA_MAPPING_ERROR) {
938                 free_pages((unsigned long)ret, order);
939                 return NULL;
940         }
941         *dma_handle = mapping;
942         return ret;
943 }
944
945 void iommu_free_coherent(struct iommu_table *tbl, size_t size,
946                          void *vaddr, dma_addr_t dma_handle)
947 {
948         if (tbl) {
949                 unsigned int nio_pages;
950
951                 size = PAGE_ALIGN(size);
952                 nio_pages = size >> tbl->it_page_shift;
953                 iommu_free(tbl, dma_handle, nio_pages);
954                 size = PAGE_ALIGN(size);
955                 free_pages((unsigned long)vaddr, get_order(size));
956         }
957 }
958
959 unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
960 {
961         switch (dir) {
962         case DMA_BIDIRECTIONAL:
963                 return TCE_PCI_READ | TCE_PCI_WRITE;
964         case DMA_FROM_DEVICE:
965                 return TCE_PCI_WRITE;
966         case DMA_TO_DEVICE:
967                 return TCE_PCI_READ;
968         default:
969                 return 0;
970         }
971 }
972 EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
973
974 #ifdef CONFIG_IOMMU_API
975 /*
976  * SPAPR TCE API
977  */
978 static void group_release(void *iommu_data)
979 {
980         struct iommu_table_group *table_group = iommu_data;
981
982         table_group->group = NULL;
983 }
984
985 void iommu_register_group(struct iommu_table_group *table_group,
986                 int pci_domain_number, unsigned long pe_num)
987 {
988         struct iommu_group *grp;
989         char *name;
990
991         grp = iommu_group_alloc();
992         if (IS_ERR(grp)) {
993                 pr_warn("powerpc iommu api: cannot create new group, err=%ld\n",
994                                 PTR_ERR(grp));
995                 return;
996         }
997         table_group->group = grp;
998         iommu_group_set_iommudata(grp, table_group, group_release);
999         name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
1000                         pci_domain_number, pe_num);
1001         if (!name)
1002                 return;
1003         iommu_group_set_name(grp, name);
1004         kfree(name);
1005 }
1006
1007 enum dma_data_direction iommu_tce_direction(unsigned long tce)
1008 {
1009         if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE))
1010                 return DMA_BIDIRECTIONAL;
1011         else if (tce & TCE_PCI_READ)
1012                 return DMA_TO_DEVICE;
1013         else if (tce & TCE_PCI_WRITE)
1014                 return DMA_FROM_DEVICE;
1015         else
1016                 return DMA_NONE;
1017 }
1018 EXPORT_SYMBOL_GPL(iommu_tce_direction);
1019
1020 void iommu_flush_tce(struct iommu_table *tbl)
1021 {
1022         /* Flush/invalidate TLB caches if necessary */
1023         if (tbl->it_ops->flush)
1024                 tbl->it_ops->flush(tbl);
1025
1026         /* Make sure updates are seen by hardware */
1027         mb();
1028 }
1029 EXPORT_SYMBOL_GPL(iommu_flush_tce);
1030
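/*
 * Check that a guest-supplied I/O bus address is page aligned and lies
 * within the table window described by @offset and @size.
 */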
1031 int iommu_tce_check_ioba(unsigned long page_shift,
1032                 unsigned long offset, unsigned long size,
1033                 unsigned long ioba, unsigned long npages)
1034 {
1035         unsigned long mask = (1UL << page_shift) - 1;
1036
1037         if (ioba & mask)
1038                 return -EINVAL;
1039
1040         ioba >>= page_shift;
1041         if (ioba < offset)
1042                 return -EINVAL;
1043
1044         if ((ioba + 1) > (offset + size))
1045                 return -EINVAL;
1046
1047         return 0;
1048 }
1049 EXPORT_SYMBOL_GPL(iommu_tce_check_ioba);
1050
1051 int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
1052 {
1053         unsigned long mask = (1UL << page_shift) - 1;
1054
1055         if (gpa & mask)
1056                 return -EINVAL;
1057
1058         return 0;
1059 }
1060 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
1061
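/*
 * Exchange the TCE at @entry without flushing the hardware TCE cache;
 * the caller is expected to follow up with iommu_tce_kill(). If the old
 * mapping let the device write to regular memory, mark the page dirty.
 */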
1062 extern long iommu_tce_xchg_no_kill(struct mm_struct *mm,
1063                 struct iommu_table *tbl,
1064                 unsigned long entry, unsigned long *hpa,
1065                 enum dma_data_direction *direction)
1066 {
1067         long ret;
1068         unsigned long size = 0;
1069
1070         ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction);
1071         if (!ret && ((*direction == DMA_FROM_DEVICE) ||
1072                         (*direction == DMA_BIDIRECTIONAL)) &&
1073                         !mm_iommu_is_devmem(mm, *hpa, tbl->it_page_shift,
1074                                         &size))
1075                 SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
1076
1077         return ret;
1078 }
1079 EXPORT_SYMBOL_GPL(iommu_tce_xchg_no_kill);
1080
1081 void iommu_tce_kill(struct iommu_table *tbl,
1082                 unsigned long entry, unsigned long pages)
1083 {
1084         if (tbl->it_ops->tce_kill)
1085                 tbl->it_ops->tce_kill(tbl, entry, pages);
1086 }
1087 EXPORT_SYMBOL_GPL(iommu_tce_kill);
1088
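/*
 * Hand the table over to an external user such as VFIO: fail with
 * -EBUSY if the kernel still holds mappings, otherwise mark the whole
 * it_map as used so the DMA API allocator stays away from the table.
 */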
1089 int iommu_take_ownership(struct iommu_table *tbl)
1090 {
1091         unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1092         int ret = 0;
1093
1094         /*
1095          * VFIO does not control TCE entry allocation, and the guest
1096          * can write new TCEs on top of existing ones, so iommu_tce_build()
1097          * must be able to release the old pages. This requires the
1098          * exchange() callback to be implemented; if it is not, we
1099          * disallow taking ownership of the table.
1100          */
1101         if (!tbl->it_ops->xchg_no_kill)
1102                 return -EINVAL;
1103
1104         spin_lock_irqsave(&tbl->large_pool.lock, flags);
1105         for (i = 0; i < tbl->nr_pools; i++)
1106                 spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1107
1108         if (iommu_table_in_use(tbl)) {
1109                 pr_err("iommu_tce: it_map is not empty\n");
1110                 ret = -EBUSY;
1111         } else {
1112                 memset(tbl->it_map, 0xff, sz);
1113         }
1114
1115         for (i = 0; i < tbl->nr_pools; i++)
1116                 spin_unlock(&tbl->pools[i].lock);
1117         spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1118
1119         return ret;
1120 }
1121 EXPORT_SYMBOL_GPL(iommu_take_ownership);
1122
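/*
 * Return the table to the kernel: clear it_map and re-reserve the first
 * page and the saved reserved range so DMA API allocations can resume.
 */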
1123 void iommu_release_ownership(struct iommu_table *tbl)
1124 {
1125         unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
1126
1127         spin_lock_irqsave(&tbl->large_pool.lock, flags);
1128         for (i = 0; i < tbl->nr_pools; i++)
1129                 spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock);
1130
1131         memset(tbl->it_map, 0, sz);
1132
1133         iommu_table_reserve_pages(tbl, tbl->it_reserved_start,
1134                         tbl->it_reserved_end);
1135
1136         for (i = 0; i < tbl->nr_pools; i++)
1137                 spin_unlock(&tbl->pools[i].lock);
1138         spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
1139 }
1140 EXPORT_SYMBOL_GPL(iommu_release_ownership);
1141
1142 int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
1143 {
1144         /*
1145          * The sysfs entries should be populated before binding the
1146          * IOMMU group. If the sysfs entries aren't ready yet, we
1147          * simply bail.
1148          */
1149         if (!device_is_registered(dev))
1150                 return -ENOENT;
1151
1152         if (device_iommu_mapped(dev)) {
1153                 pr_debug("%s: Skipping device %s with iommu group %d\n",
1154                          __func__, dev_name(dev),
1155                          iommu_group_id(dev->iommu_group));
1156                 return -EBUSY;
1157         }
1158
1159         pr_debug("%s: Adding %s to iommu group %d\n",
1160                  __func__, dev_name(dev),  iommu_group_id(table_group->group));
1161
1162         return iommu_group_add_device(table_group->group, dev);
1163 }
1164 EXPORT_SYMBOL_GPL(iommu_add_device);
1165
1166 void iommu_del_device(struct device *dev)
1167 {
1168         /*
1169          * Some devices might not have an IOMMU table and group,
1170          * in which case there is nothing to detach from the
1171          * associated IOMMU group.
1172          */
1173         if (!device_iommu_mapped(dev)) {
1174                 pr_debug("iommu_tce: skipping device %s with no tbl\n",
1175                          dev_name(dev));
1176                 return;
1177         }
1178
1179         iommu_group_remove_device(dev);
1180 }
1181 EXPORT_SYMBOL_GPL(iommu_del_device);
1182 #endif /* CONFIG_IOMMU_API */