GNU Linux-libre 4.19.263-gnu1
virt/kvm/kvm_main.c (releases.git)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *
14  * This work is licensed under the terms of the GNU GPL, version 2.  See
15  * the COPYING file in the top-level directory.
16  *
17  */
18
19 #include <kvm/iodev.h>
20
21 #include <linux/kvm_host.h>
22 #include <linux/kvm.h>
23 #include <linux/module.h>
24 #include <linux/errno.h>
25 #include <linux/percpu.h>
26 #include <linux/mm.h>
27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h>
30 #include <linux/debugfs.h>
31 #include <linux/highmem.h>
32 #include <linux/file.h>
33 #include <linux/syscore_ops.h>
34 #include <linux/cpu.h>
35 #include <linux/sched/signal.h>
36 #include <linux/sched/mm.h>
37 #include <linux/sched/stat.h>
38 #include <linux/cpumask.h>
39 #include <linux/smp.h>
40 #include <linux/anon_inodes.h>
41 #include <linux/profile.h>
42 #include <linux/kvm_para.h>
43 #include <linux/pagemap.h>
44 #include <linux/mman.h>
45 #include <linux/swap.h>
46 #include <linux/bitops.h>
47 #include <linux/spinlock.h>
48 #include <linux/compat.h>
49 #include <linux/srcu.h>
50 #include <linux/hugetlb.h>
51 #include <linux/slab.h>
52 #include <linux/sort.h>
53 #include <linux/bsearch.h>
54 #include <linux/kthread.h>
55 #include <linux/io.h>
56
57 #include <asm/processor.h>
58 #include <asm/ioctl.h>
59 #include <linux/uaccess.h>
60 #include <asm/pgtable.h>
61
62 #include "coalesced_mmio.h"
63 #include "async_pf.h"
64 #include "vfio.h"
65
66 #define CREATE_TRACE_POINTS
67 #include <trace/events/kvm.h>
68
69 /* Worst case buffer size needed for holding an integer. */
70 #define ITOA_MAX_LEN 12
71
72 MODULE_AUTHOR("Qumranet");
73 MODULE_LICENSE("GPL");
74
75 /* Architectures should define their poll value according to the halt latency */
76 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
77 module_param(halt_poll_ns, uint, 0644);
78 EXPORT_SYMBOL_GPL(halt_poll_ns);
79
80 /* Default doubles per-vcpu halt_poll_ns. */
81 unsigned int halt_poll_ns_grow = 2;
82 module_param(halt_poll_ns_grow, uint, 0644);
83 EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
84
85 /* Default resets per-vcpu halt_poll_ns. */
86 unsigned int halt_poll_ns_shrink;
87 module_param(halt_poll_ns_shrink, uint, 0644);
88 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
89
90 /*
91  * Ordering of locks:
92  *
93  *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
94  */
95
96 DEFINE_MUTEX(kvm_lock);
97 static DEFINE_RAW_SPINLOCK(kvm_count_lock);
98 LIST_HEAD(vm_list);
99
100 static cpumask_var_t cpus_hardware_enabled;
101 static int kvm_usage_count;
102 static atomic_t hardware_enable_failed;
103
104 struct kmem_cache *kvm_vcpu_cache;
105 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
106
107 static __read_mostly struct preempt_ops kvm_preempt_ops;
108
109 struct dentry *kvm_debugfs_dir;
110 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
111
112 static int kvm_debugfs_num_entries;
113 static const struct file_operations *stat_fops_per_vm[];
114
115 static struct file_operations kvm_chardev_ops;
116
117 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
118                            unsigned long arg);
119 #ifdef CONFIG_KVM_COMPAT
120 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
121                                   unsigned long arg);
122 #define KVM_COMPAT(c)   .compat_ioctl   = (c)
123 #else
124 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
125                                 unsigned long arg) { return -EINVAL; }
126 #define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl
127 #endif
128 static int hardware_enable_all(void);
129 static void hardware_disable_all(void);
130
131 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
132
133 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);
134
135 __visible bool kvm_rebooting;
136 EXPORT_SYMBOL_GPL(kvm_rebooting);
137
138 static bool largepages_enabled = true;
139
140 #define KVM_EVENT_CREATE_VM 0
141 #define KVM_EVENT_DESTROY_VM 1
142 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
143 static unsigned long long kvm_createvm_count;
144 static unsigned long long kvm_active_vms;
145
146 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
147                                                    unsigned long start, unsigned long end)
148 {
149 }
150
151 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
152 {
153         /*
154          * The metadata used by is_zone_device_page() to determine whether or
155          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
156          * the device has been pinned, e.g. by get_user_pages().  WARN if the
157          * page_count() is zero to help detect bad usage of this helper.
158          */
159         if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
160                 return false;
161
162         return is_zone_device_page(pfn_to_page(pfn));
163 }
164
165 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
166 {
167         /*
168          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
169          * perspective they are "normal" pages, albeit with slightly different
170          * usage rules.
171          */
172         if (pfn_valid(pfn))
173                 return PageReserved(pfn_to_page(pfn)) &&
174                        !is_zero_pfn(pfn) &&
175                        !kvm_is_zone_device_pfn(pfn);
176
177         return true;
178 }
179
180 /*
181  * Switches to the specified vcpu, until a matching vcpu_put()
182  */
183 void vcpu_load(struct kvm_vcpu *vcpu)
184 {
185         int cpu = get_cpu();
186         preempt_notifier_register(&vcpu->preempt_notifier);
187         kvm_arch_vcpu_load(vcpu, cpu);
188         put_cpu();
189 }
190 EXPORT_SYMBOL_GPL(vcpu_load);
191
192 void vcpu_put(struct kvm_vcpu *vcpu)
193 {
194         preempt_disable();
195         kvm_arch_vcpu_put(vcpu);
196         preempt_notifier_unregister(&vcpu->preempt_notifier);
197         preempt_enable();
198 }
199 EXPORT_SYMBOL_GPL(vcpu_put);
200
201 /* TODO: merge with kvm_arch_vcpu_should_kick */
202 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
203 {
204         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
205
206         /*
207          * We need to wait for the VCPU to reenable interrupts and get out of
208          * READING_SHADOW_PAGE_TABLES mode.
209          */
210         if (req & KVM_REQUEST_WAIT)
211                 return mode != OUTSIDE_GUEST_MODE;
212
213         /*
214          * Need to kick a running VCPU, but otherwise there is nothing to do.
215          */
216         return mode == IN_GUEST_MODE;
217 }
218
219 static void ack_flush(void *_completed)
220 {
221 }
222
223 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
224 {
225         if (unlikely(!cpus))
226                 cpus = cpu_online_mask;
227
228         if (cpumask_empty(cpus))
229                 return false;
230
231         smp_call_function_many(cpus, ack_flush, NULL, wait);
232         return true;
233 }
234
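/*
 * Post @req to every vCPU whose index is set in @vcpu_bitmap.  Unless
 * KVM_REQUEST_NO_WAKEUP is set, sleeping vCPUs are woken; the remaining
 * ones that need it are kicked with an IPI, using @tmp as scratch space.
 * Returns true if an IPI was sent to at least one CPU.
 */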
235 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
236                                  unsigned long *vcpu_bitmap, cpumask_var_t tmp)
237 {
238         int i, cpu, me;
239         struct kvm_vcpu *vcpu;
240         bool called;
241
242         me = get_cpu();
243
244         kvm_for_each_vcpu(i, vcpu, kvm) {
245                 if (!test_bit(i, vcpu_bitmap))
246                         continue;
247
248                 kvm_make_request(req, vcpu);
249                 cpu = vcpu->cpu;
250
251                 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
252                         continue;
253
254                 if (tmp != NULL && cpu != -1 && cpu != me &&
255                     kvm_request_needs_ipi(vcpu, req))
256                         __cpumask_set_cpu(cpu, tmp);
257         }
258
259         called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
260         put_cpu();
261
262         return called;
263 }
264
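/*
 * Post @req to every vCPU in the VM, waking or kicking them as needed.
 * Returns true if an IPI was sent to at least one CPU.
 */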
265 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
266 {
267         cpumask_var_t cpus;
268         bool called;
269         static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]
270                 = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX};
271
272         zalloc_cpumask_var(&cpus, GFP_ATOMIC);
273
274         called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus);
275
276         free_cpumask_var(cpus);
277         return called;
278 }
279
280 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
281 void kvm_flush_remote_tlbs(struct kvm *kvm)
282 {
283         /*
284          * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
285          * kvm_make_all_cpus_request.
286          */
287         long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);
288
289         /*
290          * We want to publish modifications to the page tables before reading
291          * mode. Pairs with a memory barrier in arch-specific code.
292          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
293          * and smp_mb in walk_shadow_page_lockless_begin/end.
294          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
295          *
296          * There is already an smp_mb__after_atomic() before
297          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
298          * barrier here.
299          */
300         if (!kvm_arch_flush_remote_tlb(kvm)
301             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
302                 ++kvm->stat.remote_tlb_flush;
303         cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
304 }
305 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
306 #endif
307
308 void kvm_reload_remote_mmus(struct kvm *kvm)
309 {
310         kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
311 }
312
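/*
 * Common vCPU setup: initialize locks and the wait queue, allocate the
 * shared kvm_run page and call the architecture-specific init hook.
 */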
313 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
314 {
315         struct page *page;
316         int r;
317
318         mutex_init(&vcpu->mutex);
319         vcpu->cpu = -1;
320         vcpu->kvm = kvm;
321         vcpu->vcpu_id = id;
322         vcpu->pid = NULL;
323         init_swait_queue_head(&vcpu->wq);
324         kvm_async_pf_vcpu_init(vcpu);
325
326         vcpu->pre_pcpu = -1;
327         INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
328
329         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
330         if (!page) {
331                 r = -ENOMEM;
332                 goto fail;
333         }
334         vcpu->run = page_address(page);
335
336         kvm_vcpu_set_in_spin_loop(vcpu, false);
337         kvm_vcpu_set_dy_eligible(vcpu, false);
338         vcpu->preempted = false;
339
340         r = kvm_arch_vcpu_init(vcpu);
341         if (r < 0)
342                 goto fail_free_run;
343         return 0;
344
345 fail_free_run:
346         free_page((unsigned long)vcpu->run);
347 fail:
348         return r;
349 }
350 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
351
352 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
353 {
354         /*
355          * no need for rcu_read_lock as VCPU_RUN is the only place that
356          * will change the vcpu->pid pointer and on uninit all file
357          * descriptors are already gone.
358          */
359         put_pid(rcu_dereference_protected(vcpu->pid, 1));
360         kvm_arch_vcpu_uninit(vcpu);
361         free_page((unsigned long)vcpu->run);
362 }
363 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
364
365 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
366 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
367 {
368         return container_of(mn, struct kvm, mmu_notifier);
369 }
370
371 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
372                                               struct mm_struct *mm,
373                                               unsigned long start, unsigned long end)
374 {
375         struct kvm *kvm = mmu_notifier_to_kvm(mn);
376         int idx;
377
378         idx = srcu_read_lock(&kvm->srcu);
379         kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
380         srcu_read_unlock(&kvm->srcu, idx);
381 }
382
383 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
384                                         struct mm_struct *mm,
385                                         unsigned long address,
386                                         pte_t pte)
387 {
388         struct kvm *kvm = mmu_notifier_to_kvm(mn);
389         int idx;
390
391         idx = srcu_read_lock(&kvm->srcu);
392         spin_lock(&kvm->mmu_lock);
393         kvm->mmu_notifier_seq++;
394         kvm_set_spte_hva(kvm, address, pte);
395         spin_unlock(&kvm->mmu_lock);
396         srcu_read_unlock(&kvm->srcu, idx);
397 }
398
399 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
400                                                     struct mm_struct *mm,
401                                                     unsigned long start,
402                                                     unsigned long end,
403                                                     bool blockable)
404 {
405         struct kvm *kvm = mmu_notifier_to_kvm(mn);
406         int need_tlb_flush = 0, idx;
407
408         idx = srcu_read_lock(&kvm->srcu);
409         spin_lock(&kvm->mmu_lock);
410         /*
411          * The count increase must become visible at unlock time as no
412          * spte can be established without taking the mmu_lock and
413          * count is also read inside the mmu_lock critical section.
414          */
415         kvm->mmu_notifier_count++;
416         need_tlb_flush = kvm_unmap_hva_range(kvm, start, end, blockable);
417         /* we have to flush the TLB before the pages can be freed */
418         if (need_tlb_flush || kvm->tlbs_dirty)
419                 kvm_flush_remote_tlbs(kvm);
420
421         spin_unlock(&kvm->mmu_lock);
422         srcu_read_unlock(&kvm->srcu, idx);
423
424         return 0;
425 }
426
427 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
428                                                   struct mm_struct *mm,
429                                                   unsigned long start,
430                                                   unsigned long end)
431 {
432         struct kvm *kvm = mmu_notifier_to_kvm(mn);
433
434         spin_lock(&kvm->mmu_lock);
435         /*
436          * This sequence increase tells the KVM page fault handler that the
437          * page that is about to be mapped in the spte could have been
438          * freed.
439          */
440         kvm->mmu_notifier_seq++;
441         smp_wmb();
442         /*
443          * The above sequence increase must be visible before the
444          * below count decrease, which is ensured by the smp_wmb above
445          * in conjunction with the smp_rmb in mmu_notifier_retry().
446          */
447         kvm->mmu_notifier_count--;
448         spin_unlock(&kvm->mmu_lock);
449
450         BUG_ON(kvm->mmu_notifier_count < 0);
451 }
452
453 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
454                                               struct mm_struct *mm,
455                                               unsigned long start,
456                                               unsigned long end)
457 {
458         struct kvm *kvm = mmu_notifier_to_kvm(mn);
459         int young, idx;
460
461         idx = srcu_read_lock(&kvm->srcu);
462         spin_lock(&kvm->mmu_lock);
463
464         young = kvm_age_hva(kvm, start, end);
465         if (young)
466                 kvm_flush_remote_tlbs(kvm);
467
468         spin_unlock(&kvm->mmu_lock);
469         srcu_read_unlock(&kvm->srcu, idx);
470
471         return young;
472 }
473
474 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
475                                         struct mm_struct *mm,
476                                         unsigned long start,
477                                         unsigned long end)
478 {
479         struct kvm *kvm = mmu_notifier_to_kvm(mn);
480         int young, idx;
481
482         idx = srcu_read_lock(&kvm->srcu);
483         spin_lock(&kvm->mmu_lock);
484         /*
485          * Even though we do not flush TLB, this will still adversely
486          * affect performance on pre-Haswell Intel EPT, where there is
487          * no EPT Access Bit to clear so that we have to tear down EPT
488          * tables instead. If we find this unacceptable, we can always
489          * add a parameter to kvm_age_hva so that it effectively doesn't
490          * do anything on clear_young.
491          *
492          * Also note that currently we never issue secondary TLB flushes
493          * from clear_young, leaving this job up to the regular system
494          * cadence. If we find this inaccurate, we might come up with a
495          * more sophisticated heuristic later.
496          */
497         young = kvm_age_hva(kvm, start, end);
498         spin_unlock(&kvm->mmu_lock);
499         srcu_read_unlock(&kvm->srcu, idx);
500
501         return young;
502 }
503
504 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
505                                        struct mm_struct *mm,
506                                        unsigned long address)
507 {
508         struct kvm *kvm = mmu_notifier_to_kvm(mn);
509         int young, idx;
510
511         idx = srcu_read_lock(&kvm->srcu);
512         spin_lock(&kvm->mmu_lock);
513         young = kvm_test_age_hva(kvm, address);
514         spin_unlock(&kvm->mmu_lock);
515         srcu_read_unlock(&kvm->srcu, idx);
516
517         return young;
518 }
519
520 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
521                                      struct mm_struct *mm)
522 {
523         struct kvm *kvm = mmu_notifier_to_kvm(mn);
524         int idx;
525
526         idx = srcu_read_lock(&kvm->srcu);
527         kvm_arch_flush_shadow_all(kvm);
528         srcu_read_unlock(&kvm->srcu, idx);
529 }
530
531 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
532         .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
533         .invalidate_range       = kvm_mmu_notifier_invalidate_range,
534         .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
535         .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
536         .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
537         .clear_young            = kvm_mmu_notifier_clear_young,
538         .test_young             = kvm_mmu_notifier_test_young,
539         .change_pte             = kvm_mmu_notifier_change_pte,
540         .release                = kvm_mmu_notifier_release,
541 };
542
543 static int kvm_init_mmu_notifier(struct kvm *kvm)
544 {
545         kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
546         return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
547 }
548
549 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
550
551 static int kvm_init_mmu_notifier(struct kvm *kvm)
552 {
553         return 0;
554 }
555
556 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
557
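/*
 * Allocate an empty memslot array; every slot starts out unused, with
 * id_to_index[] mapping each slot id to its own array index.
 */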
558 static struct kvm_memslots *kvm_alloc_memslots(void)
559 {
560         int i;
561         struct kvm_memslots *slots;
562
563         slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
564         if (!slots)
565                 return NULL;
566
567         for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
568                 slots->id_to_index[i] = slots->memslots[i].id = i;
569
570         return slots;
571 }
572
573 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
574 {
575         if (!memslot->dirty_bitmap)
576                 return;
577
578         kvfree(memslot->dirty_bitmap);
579         memslot->dirty_bitmap = NULL;
580 }
581
582 /*
583  * Free any memory in @free but not in @dont.
584  */
585 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
586                               struct kvm_memory_slot *dont)
587 {
588         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
589                 kvm_destroy_dirty_bitmap(free);
590
591         kvm_arch_free_memslot(kvm, free, dont);
592
593         free->npages = 0;
594 }
595
596 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
597 {
598         struct kvm_memory_slot *memslot;
599
600         if (!slots)
601                 return;
602
603         kvm_for_each_memslot(memslot, slots)
604                 kvm_free_memslot(kvm, memslot, NULL);
605
606         kvfree(slots);
607 }
608
609 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
610 {
611         int i;
612
613         if (!kvm->debugfs_dentry)
614                 return;
615
616         debugfs_remove_recursive(kvm->debugfs_dentry);
617
618         if (kvm->debugfs_stat_data) {
619                 for (i = 0; i < kvm_debugfs_num_entries; i++)
620                         kfree(kvm->debugfs_stat_data[i]);
621                 kfree(kvm->debugfs_stat_data);
622         }
623 }
624
625 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
626 {
627         char dir_name[ITOA_MAX_LEN * 2];
628         struct kvm_stat_data *stat_data;
629         struct kvm_stats_debugfs_item *p;
630
631         if (!debugfs_initialized())
632                 return 0;
633
634         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
635         kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
636
637         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
638                                          sizeof(*kvm->debugfs_stat_data),
639                                          GFP_KERNEL);
640         if (!kvm->debugfs_stat_data)
641                 return -ENOMEM;
642
643         for (p = debugfs_entries; p->name; p++) {
644                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
645                 if (!stat_data)
646                         return -ENOMEM;
647
648                 stat_data->kvm = kvm;
649                 stat_data->offset = p->offset;
650                 stat_data->mode = p->mode ? p->mode : 0644;
651                 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
652                 debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
653                                     stat_data, stat_fops_per_vm[p->kind]);
654         }
655         return 0;
656 }
657
658 /*
659  * Called after the VM is otherwise initialized, but just before adding it to
660  * the vm_list.
661  */
662 int __weak kvm_arch_post_init_vm(struct kvm *kvm)
663 {
664         return 0;
665 }
666
667 /*
668  * Called just after removing the VM from the vm_list, but before doing any
669  * other destruction.
670  */
671 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
672 {
673 }
674
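/*
 * Allocate and initialize a VM: locks, per-address-space memslots, I/O
 * buses, SRCU state, the MMU notifier and the arch-specific parts.  On
 * success the VM is added to vm_list and holds a reference on this
 * module so it cannot be unloaded while the VM exists.
 */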
675 static struct kvm *kvm_create_vm(unsigned long type)
676 {
677         int r, i;
678         struct kvm *kvm = kvm_arch_alloc_vm();
679
680         if (!kvm)
681                 return ERR_PTR(-ENOMEM);
682
683         spin_lock_init(&kvm->mmu_lock);
684         mmgrab(current->mm);
685         kvm->mm = current->mm;
686         kvm_eventfd_init(kvm);
687         mutex_init(&kvm->lock);
688         mutex_init(&kvm->irq_lock);
689         mutex_init(&kvm->slots_lock);
690         refcount_set(&kvm->users_count, 1);
691         INIT_LIST_HEAD(&kvm->devices);
692
693         r = kvm_arch_init_vm(kvm, type);
694         if (r)
695                 goto out_err_no_disable;
696
697         r = hardware_enable_all();
698         if (r)
699                 goto out_err_no_disable;
700
701 #ifdef CONFIG_HAVE_KVM_IRQFD
702         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
703 #endif
704
705         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
706
707         r = -ENOMEM;
708         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
709                 struct kvm_memslots *slots = kvm_alloc_memslots();
710                 if (!slots)
711                         goto out_err_no_srcu;
712                 /*
713                  * Generations must be different for each address space.
714                  * Init the kvm generation close to the maximum to easily test
715                  * the code that handles generation number wrap-around.
716                  */
717                 slots->generation = i * 2 - 150;
718                 rcu_assign_pointer(kvm->memslots[i], slots);
719         }
720
721         if (init_srcu_struct(&kvm->srcu))
722                 goto out_err_no_srcu;
723         if (init_srcu_struct(&kvm->irq_srcu))
724                 goto out_err_no_irq_srcu;
725         for (i = 0; i < KVM_NR_BUSES; i++) {
726                 rcu_assign_pointer(kvm->buses[i],
727                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
728                 if (!kvm->buses[i])
729                         goto out_err_no_mmu_notifier;
730         }
731
732         r = kvm_init_mmu_notifier(kvm);
733         if (r)
734                 goto out_err_no_mmu_notifier;
735
736         r = kvm_arch_post_init_vm(kvm);
737         if (r)
738                 goto out_err;
739
740         mutex_lock(&kvm_lock);
741         list_add(&kvm->vm_list, &vm_list);
742         mutex_unlock(&kvm_lock);
743
744         preempt_notifier_inc();
745
746         /*
747          * When the fd passed to this ioctl() is opened it pins the module,
748          * but try_module_get() also prevents getting a reference if the module
749          * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
750          */
751         if (!try_module_get(kvm_chardev_ops.owner)) {
752                 r = -ENODEV;
753                 goto out_err;
754         }
755
756         return kvm;
757
758 out_err:
759 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
760         if (kvm->mmu_notifier.ops)
761                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
762 #endif
763 out_err_no_mmu_notifier:
764         cleanup_srcu_struct(&kvm->irq_srcu);
765 out_err_no_irq_srcu:
766         cleanup_srcu_struct(&kvm->srcu);
767 out_err_no_srcu:
768         hardware_disable_all();
769 out_err_no_disable:
770         refcount_set(&kvm->users_count, 0);
771         for (i = 0; i < KVM_NR_BUSES; i++)
772                 kfree(kvm_get_bus(kvm, i));
773         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
774                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
775         kvm_arch_free_vm(kvm);
776         mmdrop(current->mm);
777         return ERR_PTR(r);
778 }
779
780 static void kvm_destroy_devices(struct kvm *kvm)
781 {
782         struct kvm_device *dev, *tmp;
783
784         /*
785          * We do not need to take the kvm->lock here, because nobody else
786          * has a reference to the struct kvm at this point and therefore
787          * cannot access the devices list anyhow.
788          */
789         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
790                 list_del(&dev->vm_node);
791                 dev->ops->destroy(dev);
792         }
793 }
794
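/*
 * Tear down a VM once its last reference has been dropped: remove it
 * from vm_list, destroy its buses, devices and memslots, and release
 * the mm and module references taken in kvm_create_vm().
 */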
795 static void kvm_destroy_vm(struct kvm *kvm)
796 {
797         int i;
798         struct mm_struct *mm = kvm->mm;
799
800         kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
801         kvm_destroy_vm_debugfs(kvm);
802         kvm_arch_sync_events(kvm);
803         mutex_lock(&kvm_lock);
804         list_del(&kvm->vm_list);
805         mutex_unlock(&kvm_lock);
806         kvm_arch_pre_destroy_vm(kvm);
807
808         kvm_free_irq_routing(kvm);
809         for (i = 0; i < KVM_NR_BUSES; i++) {
810                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
811
812                 if (bus)
813                         kvm_io_bus_destroy(bus);
814                 kvm->buses[i] = NULL;
815         }
816         kvm_coalesced_mmio_free(kvm);
817 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
818         mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
819 #else
820         kvm_arch_flush_shadow_all(kvm);
821 #endif
822         kvm_arch_destroy_vm(kvm);
823         kvm_destroy_devices(kvm);
824         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
825                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
826         cleanup_srcu_struct(&kvm->irq_srcu);
827         cleanup_srcu_struct(&kvm->srcu);
828         kvm_arch_free_vm(kvm);
829         preempt_notifier_dec();
830         hardware_disable_all();
831         mmdrop(mm);
832         module_put(kvm_chardev_ops.owner);
833 }
834
835 void kvm_get_kvm(struct kvm *kvm)
836 {
837         refcount_inc(&kvm->users_count);
838 }
839 EXPORT_SYMBOL_GPL(kvm_get_kvm);
840
841 void kvm_put_kvm(struct kvm *kvm)
842 {
843         if (refcount_dec_and_test(&kvm->users_count))
844                 kvm_destroy_vm(kvm);
845 }
846 EXPORT_SYMBOL_GPL(kvm_put_kvm);
847
848
849 static int kvm_vm_release(struct inode *inode, struct file *filp)
850 {
851         struct kvm *kvm = filp->private_data;
852
853         kvm_irqfd_release(kvm);
854
855         kvm_put_kvm(kvm);
856         return 0;
857 }
858
859 /*
860  * Allocation size is twice as large as the actual dirty bitmap size.
861  * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
862  */
863 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
864 {
865         unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
866
867         memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
868         if (!memslot->dirty_bitmap)
869                 return -ENOMEM;
870
871         return 0;
872 }
873
874 /*
875  * Insert the memslot and re-sort the memslots based on their GFN,
876  * so that binary search can be used to look up a GFN.
877  * The sorting algorithm takes advantage of the array being initially
878  * sorted and the changed memslot's position being known.
879  */
880 static void update_memslots(struct kvm_memslots *slots,
881                             struct kvm_memory_slot *new)
882 {
883         int id = new->id;
884         int i = slots->id_to_index[id];
885         struct kvm_memory_slot *mslots = slots->memslots;
886
887         WARN_ON(mslots[i].id != id);
888         if (!new->npages) {
889                 WARN_ON(!mslots[i].npages);
890                 if (mslots[i].npages)
891                         slots->used_slots--;
892         } else {
893                 if (!mslots[i].npages)
894                         slots->used_slots++;
895         }
896
897         while (i < KVM_MEM_SLOTS_NUM - 1 &&
898                new->base_gfn <= mslots[i + 1].base_gfn) {
899                 if (!mslots[i + 1].npages)
900                         break;
901                 mslots[i] = mslots[i + 1];
902                 slots->id_to_index[mslots[i].id] = i;
903                 i++;
904         }
905
906         /*
907          * The ">=" is needed when creating a slot with base_gfn == 0,
908          * so that it moves before all those with base_gfn == npages == 0.
909          *
910          * On the other hand, if new->npages is zero, the above loop has
911          * already left i pointing to the beginning of the empty part of
912          * mslots, and the ">=" would move the hole backwards in this
913          * case---which is wrong.  So skip the loop when deleting a slot.
914          */
915         if (new->npages) {
916                 while (i > 0 &&
917                        new->base_gfn >= mslots[i - 1].base_gfn) {
918                         mslots[i] = mslots[i - 1];
919                         slots->id_to_index[mslots[i].id] = i;
920                         i--;
921                 }
922         } else
923                 WARN_ON_ONCE(i != slots->used_slots);
924
925         mslots[i] = *new;
926         slots->id_to_index[mslots[i].id] = i;
927 }
928
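/* Reject userspace memory region flags that this build does not support. */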
929 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
930 {
931         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
932
933 #ifdef __KVM_HAVE_READONLY_MEM
934         valid_flags |= KVM_MEM_READONLY;
935 #endif
936
937         if (mem->flags & ~valid_flags)
938                 return -EINVAL;
939
940         return 0;
941 }
942
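/*
 * Publish @slots for address space @as_id, wait for all readers of the
 * old memslots to finish (SRCU), bump the generation and return the old
 * memslots so the caller can reuse or free them.
 */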
943 static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
944                 int as_id, struct kvm_memslots *slots)
945 {
946         struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
947         u64 gen;
948
949         /*
950          * Set the low bit in the generation, which disables SPTE caching
951          * until the end of synchronize_srcu_expedited.
952          */
953         WARN_ON(old_memslots->generation & 1);
954         slots->generation = old_memslots->generation + 1;
955
956         rcu_assign_pointer(kvm->memslots[as_id], slots);
957         synchronize_srcu_expedited(&kvm->srcu);
958
959         /*
960          * Increment the new memslot generation a second time. This prevents
961          * vm exits that race with memslot updates from caching a memslot
962          * generation that will (potentially) be valid forever.
963          *
964          * Generations must be unique even across address spaces.  We do not need
965          * a global counter for that, instead the generation space is evenly split
966          * across address spaces.  For example, with two address spaces, address
967  * space 0 will use generations 0, 4, 8, ... while address space 1 will
968          * use generations 2, 6, 10, 14, ...
969          */
970         gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
971
972         kvm_arch_memslots_updated(kvm, gen);
973
974         slots->generation = gen;
975
976         return old_memslots;
977 }
978
979 /*
980  * Allocate some memory and give it an address in the guest physical address
981  * space.
982  *
983  * Discontiguous memory is allowed, mostly for framebuffers.
984  *
985  * Must be called holding kvm->slots_lock for write.
986  */
987 int __kvm_set_memory_region(struct kvm *kvm,
988                             const struct kvm_userspace_memory_region *mem)
989 {
990         int r;
991         gfn_t base_gfn;
992         unsigned long npages;
993         struct kvm_memory_slot *slot;
994         struct kvm_memory_slot old, new;
995         struct kvm_memslots *slots = NULL, *old_memslots;
996         int as_id, id;
997         enum kvm_mr_change change;
998
999         r = check_memory_region_flags(mem);
1000         if (r)
1001                 goto out;
1002
1003         r = -EINVAL;
1004         as_id = mem->slot >> 16;
1005         id = (u16)mem->slot;
1006
1007         /* General sanity checks */
1008         if (mem->memory_size & (PAGE_SIZE - 1))
1009                 goto out;
1010         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
1011                 goto out;
1012         /* We can read the guest memory with __xxx_user() later on. */
1013         if ((id < KVM_USER_MEM_SLOTS) &&
1014             ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
1015              !access_ok(VERIFY_WRITE,
1016                         (void __user *)(unsigned long)mem->userspace_addr,
1017                         mem->memory_size)))
1018                 goto out;
1019         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
1020                 goto out;
1021         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1022                 goto out;
1023
1024         slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
1025         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
1026         npages = mem->memory_size >> PAGE_SHIFT;
1027
1028         if (npages > KVM_MEM_MAX_NR_PAGES)
1029                 goto out;
1030
1031         new = old = *slot;
1032
1033         new.id = id;
1034         new.base_gfn = base_gfn;
1035         new.npages = npages;
1036         new.flags = mem->flags;
1037
1038         if (npages) {
1039                 if (!old.npages)
1040                         change = KVM_MR_CREATE;
1041                 else { /* Modify an existing slot. */
1042                         if ((mem->userspace_addr != old.userspace_addr) ||
1043                             (npages != old.npages) ||
1044                             ((new.flags ^ old.flags) & KVM_MEM_READONLY))
1045                                 goto out;
1046
1047                         if (base_gfn != old.base_gfn)
1048                                 change = KVM_MR_MOVE;
1049                         else if (new.flags != old.flags)
1050                                 change = KVM_MR_FLAGS_ONLY;
1051                         else { /* Nothing to change. */
1052                                 r = 0;
1053                                 goto out;
1054                         }
1055                 }
1056         } else {
1057                 if (!old.npages)
1058                         goto out;
1059
1060                 change = KVM_MR_DELETE;
1061                 new.base_gfn = 0;
1062                 new.flags = 0;
1063         }
1064
1065         if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
1066                 /* Check for overlaps */
1067                 r = -EEXIST;
1068                 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
1069                         if (slot->id == id)
1070                                 continue;
1071                         if (!((base_gfn + npages <= slot->base_gfn) ||
1072                               (base_gfn >= slot->base_gfn + slot->npages)))
1073                                 goto out;
1074                 }
1075         }
1076
1077         /* Free page dirty bitmap if unneeded */
1078         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
1079                 new.dirty_bitmap = NULL;
1080
1081         r = -ENOMEM;
1082         if (change == KVM_MR_CREATE) {
1083                 new.userspace_addr = mem->userspace_addr;
1084
1085                 if (kvm_arch_create_memslot(kvm, &new, npages))
1086                         goto out_free;
1087         }
1088
1089         /* Allocate page dirty bitmap if needed */
1090         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
1091                 if (kvm_create_dirty_bitmap(&new) < 0)
1092                         goto out_free;
1093         }
1094
1095         slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
1096         if (!slots)
1097                 goto out_free;
1098         memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
1099
1100         if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
1101                 slot = id_to_memslot(slots, id);
1102                 slot->flags |= KVM_MEMSLOT_INVALID;
1103
1104                 old_memslots = install_new_memslots(kvm, as_id, slots);
1105
1106                 /* From this point no new shadow pages pointing to a deleted,
1107                  * or moved, memslot will be created.
1108                  *
1109                  * validation of sp->gfn happens in:
1110                  *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1111                  *      - kvm_is_visible_gfn (mmu_check_roots)
1112                  */
1113                 kvm_arch_flush_shadow_memslot(kvm, slot);
1114
1115                 /*
1116                  * We can re-use the old_memslots from above, the only difference
1117                  * from the currently installed memslots is the invalid flag.  This
1118                  * will get overwritten by update_memslots anyway.
1119                  */
1120                 slots = old_memslots;
1121         }
1122
1123         r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
1124         if (r)
1125                 goto out_slots;
1126
1127         /* actual memory is freed via old in kvm_free_memslot below */
1128         if (change == KVM_MR_DELETE) {
1129                 new.dirty_bitmap = NULL;
1130                 memset(&new.arch, 0, sizeof(new.arch));
1131         }
1132
1133         update_memslots(slots, &new);
1134         old_memslots = install_new_memslots(kvm, as_id, slots);
1135
1136         kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);
1137
1138         kvm_free_memslot(kvm, &old, &new);
1139         kvfree(old_memslots);
1140         return 0;
1141
1142 out_slots:
1143         kvfree(slots);
1144 out_free:
1145         kvm_free_memslot(kvm, &new, &old);
1146 out:
1147         return r;
1148 }
1149 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1150
1151 int kvm_set_memory_region(struct kvm *kvm,
1152                           const struct kvm_userspace_memory_region *mem)
1153 {
1154         int r;
1155
1156         mutex_lock(&kvm->slots_lock);
1157         r = __kvm_set_memory_region(kvm, mem);
1158         mutex_unlock(&kvm->slots_lock);
1159         return r;
1160 }
1161 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1162
1163 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1164                                           struct kvm_userspace_memory_region *mem)
1165 {
1166         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1167                 return -EINVAL;
1168
1169         return kvm_set_memory_region(kvm, mem);
1170 }
1171
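/*
 * Copy a memslot's dirty bitmap out to userspace without clearing it,
 * and report via @is_dirty whether any bit was set.
 */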
1172 int kvm_get_dirty_log(struct kvm *kvm,
1173                         struct kvm_dirty_log *log, int *is_dirty)
1174 {
1175         struct kvm_memslots *slots;
1176         struct kvm_memory_slot *memslot;
1177         int i, as_id, id;
1178         unsigned long n;
1179         unsigned long any = 0;
1180
1181         as_id = log->slot >> 16;
1182         id = (u16)log->slot;
1183         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1184                 return -EINVAL;
1185
1186         slots = __kvm_memslots(kvm, as_id);
1187         memslot = id_to_memslot(slots, id);
1188         if (!memslot->dirty_bitmap)
1189                 return -ENOENT;
1190
1191         n = kvm_dirty_bitmap_bytes(memslot);
1192
1193         for (i = 0; !any && i < n/sizeof(long); ++i)
1194                 any = memslot->dirty_bitmap[i];
1195
1196         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1197                 return -EFAULT;
1198
1199         if (any)
1200                 *is_dirty = 1;
1201         return 0;
1202 }
1203 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1204
1205 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1206 /**
1207  * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
1208  *      are dirty write protect them for next write.
1209  * @kvm:        pointer to kvm instance
1210  * @log:        slot id and address to which we copy the log
1211  * @is_dirty:   flag set if any page is dirty
1212  *
1213  * Keep in mind that VCPU threads can write to the bitmap
1214  * concurrently. So, to avoid losing track of dirty pages, we keep the
1215  * following order:
1216  *
1217  *    1. Take a snapshot of the bit and clear it if needed.
1218  *    2. Write protect the corresponding page.
1219  *    3. Copy the snapshot to userspace.
1220  *    4. Upon return, the caller flushes TLBs if needed.
1221  *
1222  * Between 2 and 4, the guest may write to the page using the remaining TLB
1223  * entry.  This is not a problem because the page is reported dirty using
1224  * the snapshot taken before and step 4 ensures that writes done after
1225  * exiting to userspace will be logged for the next call.
1226  *
1227  */
1228 int kvm_get_dirty_log_protect(struct kvm *kvm,
1229                         struct kvm_dirty_log *log, bool *is_dirty)
1230 {
1231         struct kvm_memslots *slots;
1232         struct kvm_memory_slot *memslot;
1233         int i, as_id, id;
1234         unsigned long n;
1235         unsigned long *dirty_bitmap;
1236         unsigned long *dirty_bitmap_buffer;
1237
1238         as_id = log->slot >> 16;
1239         id = (u16)log->slot;
1240         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1241                 return -EINVAL;
1242
1243         slots = __kvm_memslots(kvm, as_id);
1244         memslot = id_to_memslot(slots, id);
1245
1246         dirty_bitmap = memslot->dirty_bitmap;
1247         if (!dirty_bitmap)
1248                 return -ENOENT;
1249
1250         n = kvm_dirty_bitmap_bytes(memslot);
1251
1252         dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
1253         memset(dirty_bitmap_buffer, 0, n);
1254
1255         spin_lock(&kvm->mmu_lock);
1256         *is_dirty = false;
1257         for (i = 0; i < n / sizeof(long); i++) {
1258                 unsigned long mask;
1259                 gfn_t offset;
1260
1261                 if (!dirty_bitmap[i])
1262                         continue;
1263
1264                 *is_dirty = true;
1265
1266                 mask = xchg(&dirty_bitmap[i], 0);
1267                 dirty_bitmap_buffer[i] = mask;
1268
1269                 if (mask) {
1270                         offset = i * BITS_PER_LONG;
1271                         kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
1272                                                                 offset, mask);
1273                 }
1274         }
1275
1276         spin_unlock(&kvm->mmu_lock);
1277         if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1278                 return -EFAULT;
1279         return 0;
1280 }
1281 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1282 #endif
1283
1284 bool kvm_largepages_enabled(void)
1285 {
1286         return largepages_enabled;
1287 }
1288
1289 void kvm_disable_largepages(void)
1290 {
1291         largepages_enabled = false;
1292 }
1293 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
1294
1295 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1296 {
1297         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
1298 }
1299 EXPORT_SYMBOL_GPL(gfn_to_memslot);
1300
1301 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
1302 {
1303         return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
1304 }
1305
1306 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
1307 {
1308         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
1309
1310         if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
1311               memslot->flags & KVM_MEMSLOT_INVALID)
1312                 return false;
1313
1314         return true;
1315 }
1316 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
1317
1318 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
1319 {
1320         struct vm_area_struct *vma;
1321         unsigned long addr, size;
1322
1323         size = PAGE_SIZE;
1324
1325         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
1326         if (kvm_is_error_hva(addr))
1327                 return PAGE_SIZE;
1328
1329         down_read(&current->mm->mmap_sem);
1330         vma = find_vma(current->mm, addr);
1331         if (!vma)
1332                 goto out;
1333
1334         size = vma_kernel_pagesize(vma);
1335
1336 out:
1337         up_read(&current->mm->mmap_sem);
1338
1339         return size;
1340 }
1341
1342 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
1343 {
1344         return slot->flags & KVM_MEM_READONLY;
1345 }
1346
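/*
 * Translate @gfn within @slot to a host virtual address.  Optionally
 * report in @nr_pages how many pages remain in the slot from @gfn on;
 * @write access to a read-only slot fails with KVM_HVA_ERR_RO_BAD.
 */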
1347 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1348                                        gfn_t *nr_pages, bool write)
1349 {
1350         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1351                 return KVM_HVA_ERR_BAD;
1352
1353         if (memslot_is_readonly(slot) && write)
1354                 return KVM_HVA_ERR_RO_BAD;
1355
1356         if (nr_pages)
1357                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
1358
1359         return __gfn_to_hva_memslot(slot, gfn);
1360 }
1361
1362 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
1363                                      gfn_t *nr_pages)
1364 {
1365         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
1366 }
1367
1368 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
1369                                         gfn_t gfn)
1370 {
1371         return gfn_to_hva_many(slot, gfn, NULL);
1372 }
1373 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
1374
1375 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
1376 {
1377         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
1378 }
1379 EXPORT_SYMBOL_GPL(gfn_to_hva);
1380
1381 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
1382 {
1383         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
1384 }
1385 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
1386
1387 /*
1388  * If writable is set to false, the hva returned by this function is only
1389  * allowed to be read.
1390  */
1391 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
1392                                       gfn_t gfn, bool *writable)
1393 {
1394         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
1395
1396         if (!kvm_is_error_hva(hva) && writable)
1397                 *writable = !memslot_is_readonly(slot);
1398
1399         return hva;
1400 }
1401
1402 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
1403 {
1404         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1405
1406         return gfn_to_hva_memslot_prot(slot, gfn, writable);
1407 }
1408
1409 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
1410 {
1411         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1412
1413         return gfn_to_hva_memslot_prot(slot, gfn, writable);
1414 }
1415
1416 static inline int check_user_page_hwpoison(unsigned long addr)
1417 {
1418         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
1419
1420         rc = get_user_pages(addr, 1, flags, NULL, NULL);
1421         return rc == -EHWPOISON;
1422 }
1423
1424 /*
1425  * The fast path to get the writable pfn which will be stored in @pfn;
1426  * true indicates success, otherwise false is returned.  It's also the
1427  * only part that runs if we are in atomic context.
1428  */
1429 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
1430                             bool *writable, kvm_pfn_t *pfn)
1431 {
1432         struct page *page[1];
1433         int npages;
1434
1435         /*
1436          * Fast pin a writable pfn only if it is a write fault request
1437          * or the caller allows mapping a writable pfn for a read fault
1438          * request.
1439          */
1440         if (!(write_fault || writable))
1441                 return false;
1442
1443         npages = __get_user_pages_fast(addr, 1, 1, page);
1444         if (npages == 1) {
1445                 *pfn = page_to_pfn(page[0]);
1446
1447                 if (writable)
1448                         *writable = true;
1449                 return true;
1450         }
1451
1452         return false;
1453 }
1454
1455 /*
1456  * The slow path to get the pfn of the specified host virtual address;
1457  * 1 indicates success, -errno is returned if an error is detected.
1458  */
1459 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1460                            bool *writable, kvm_pfn_t *pfn)
1461 {
1462         unsigned int flags = FOLL_HWPOISON;
1463         struct page *page;
1464         int npages = 0;
1465
1466         might_sleep();
1467
1468         if (writable)
1469                 *writable = write_fault;
1470
1471         if (write_fault)
1472                 flags |= FOLL_WRITE;
1473         if (async)
1474                 flags |= FOLL_NOWAIT;
1475
1476         npages = get_user_pages_unlocked(addr, 1, &page, flags);
1477         if (npages != 1)
1478                 return npages;
1479
1480         /* map read fault as writable if possible */
1481         if (unlikely(!write_fault) && writable) {
1482                 struct page *wpage;
1483
1484                 if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
1485                         *writable = true;
1486                         put_page(page);
1487                         page = wpage;
1488                 }
1489         }
1490         *pfn = page_to_pfn(page);
1491         return npages;
1492 }
1493
1494 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
1495 {
1496         if (unlikely(!(vma->vm_flags & VM_READ)))
1497                 return false;
1498
1499         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
1500                 return false;
1501
1502         return true;
1503 }
1504
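/* Take a reference on the page behind @pfn; reserved pfns need no reference. */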
1505 static int kvm_try_get_pfn(kvm_pfn_t pfn)
1506 {
1507         if (kvm_is_reserved_pfn(pfn))
1508                 return 1;
1509         return get_page_unless_zero(pfn_to_page(pfn));
1510 }
1511
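/*
 * Resolve the pfn behind a VM_IO/VM_PFNMAP mapping by walking the host
 * page tables directly, faulting the mapping in first if necessary.
 */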
1512 static int hva_to_pfn_remapped(struct vm_area_struct *vma,
1513                                unsigned long addr, bool *async,
1514                                bool write_fault, bool *writable,
1515                                kvm_pfn_t *p_pfn)
1516 {
1517         kvm_pfn_t pfn;
1518         pte_t *ptep;
1519         spinlock_t *ptl;
1520         int r;
1521
1522         r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
1523         if (r) {
1524                 /*
1525                  * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
1526                  * not call the fault handler, so do it here.
1527                  */
1528                 bool unlocked = false;
1529                 r = fixup_user_fault(current, current->mm, addr,
1530                                      (write_fault ? FAULT_FLAG_WRITE : 0),
1531                                      &unlocked);
1532                 if (unlocked)
1533                         return -EAGAIN;
1534                 if (r)
1535                         return r;
1536
1537                 r = follow_pte_pmd(vma->vm_mm, addr, NULL, NULL, &ptep, NULL, &ptl);
1538                 if (r)
1539                         return r;
1540         }
1541
1542         if (write_fault && !pte_write(*ptep)) {
1543                 pfn = KVM_PFN_ERR_RO_FAULT;
1544                 goto out;
1545         }
1546
1547         if (writable)
1548                 *writable = pte_write(*ptep);
1549         pfn = pte_pfn(*ptep);
1550
1551         /*
1552          * Get a reference here because callers of *hva_to_pfn* and
1553          * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
1554          * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
1555          * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
1556          * simply do nothing for reserved pfns.
1557          *
1558          * Whoever called remap_pfn_range is also going to call e.g.
1559          * unmap_mapping_range before the underlying pages are freed,
1560          * causing a call to our MMU notifier.
1561          *
1562          * Certain IO or PFNMAP mappings can be backed with valid
1563          * struct pages, but be allocated without refcounting e.g.,
1564          * tail pages of non-compound higher order allocations, which
1565          * would then underflow the refcount when the caller does the
1566          * required put_page. Don't allow those pages here.
1567          */ 
1568         if (!kvm_try_get_pfn(pfn))
1569                 r = -EFAULT;
1570
1571 out:
1572         pte_unmap_unlock(ptep, ptl);
1573         *p_pfn = pfn;
1574
1575         return r;
1576 }
1577
1578 /*
1579  * Pin guest page in memory and return its pfn.
1580  * @addr: host virtual address which maps memory to the guest
1581  * @atomic: whether the call must not sleep (i.e. we are in atomic context)
1582  * @async: whether this function needs to wait for IO to complete if the
1583  *         host page is not in memory
1584  * @write_fault: whether we should get a writable host page
1585  * @writable: whether it is allowed to map a writable host page for !@write_fault
1586  *
1587  * The function will map a writable host page for these two cases:
1588  * 1): @write_fault = true
1589  * 2): @write_fault = false && @writable, @writable will tell the caller
1590  *     whether the mapping is writable.
1591  */
1592 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
1593                         bool write_fault, bool *writable)
1594 {
1595         struct vm_area_struct *vma;
1596         kvm_pfn_t pfn = 0;
1597         int npages, r;
1598
1599         /* we can do it either atomically or asynchronously, not both */
1600         BUG_ON(atomic && async);
1601
1602         if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
1603                 return pfn;
1604
1605         if (atomic)
1606                 return KVM_PFN_ERR_FAULT;
1607
1608         npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
1609         if (npages == 1)
1610                 return pfn;
1611
1612         down_read(&current->mm->mmap_sem);
1613         if (npages == -EHWPOISON ||
1614               (!async && check_user_page_hwpoison(addr))) {
1615                 pfn = KVM_PFN_ERR_HWPOISON;
1616                 goto exit;
1617         }
1618
1619 retry:
1620         vma = find_vma_intersection(current->mm, addr, addr + 1);
1621
1622         if (vma == NULL)
1623                 pfn = KVM_PFN_ERR_FAULT;
1624         else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
1625                 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
1626                 if (r == -EAGAIN)
1627                         goto retry;
1628                 if (r < 0)
1629                         pfn = KVM_PFN_ERR_FAULT;
1630         } else {
1631                 if (async && vma_is_valid(vma, write_fault))
1632                         *async = true;
1633                 pfn = KVM_PFN_ERR_FAULT;
1634         }
1635 exit:
1636         up_read(&current->mm->mmap_sem);
1637         return pfn;
1638 }
1639
1640 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
1641                                bool atomic, bool *async, bool write_fault,
1642                                bool *writable)
1643 {
1644         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
1645
1646         if (addr == KVM_HVA_ERR_RO_BAD) {
1647                 if (writable)
1648                         *writable = false;
1649                 return KVM_PFN_ERR_RO_FAULT;
1650         }
1651
1652         if (kvm_is_error_hva(addr)) {
1653                 if (writable)
1654                         *writable = false;
1655                 return KVM_PFN_NOSLOT;
1656         }
1657
1658         /* Do not map a writable pfn into a read-only memslot. */
1659         if (writable && memslot_is_readonly(slot)) {
1660                 *writable = false;
1661                 writable = NULL;
1662         }
1663
1664         return hva_to_pfn(addr, atomic, async, write_fault,
1665                           writable);
1666 }
1667 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
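/*
 * Illustrative sketch, not part of the kernel source: how a hypothetical
 * arch-level caller might consume the @async and @writable outputs of
 * __gfn_to_pfn_memslot().  "slot", "gfn" and "write_fault" stand for values
 * the caller already has.
 *
 *        bool async = false, writable = false;
 *        kvm_pfn_t pfn;
 *
 *        pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write_fault,
 *                                   &writable);
 *        if (async) {
 *                // the page was not resident; the fault could be completed
 *                // asynchronously instead of synchronously here
 *        }
 *        if (is_error_noslot_pfn(pfn)) {
 *                // no usable pfn: KVM_PFN_NOSLOT, KVM_PFN_ERR_RO_FAULT, ...
 *        }
 */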
1668
1669 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
1670                       bool *writable)
1671 {
1672         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
1673                                     write_fault, writable);
1674 }
1675 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
1676
1677 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
1678 {
1679         return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
1680 }
1681 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
1682
1683 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
1684 {
1685         return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
1686 }
1687 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
1688
1689 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
1690 {
1691         return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn);
1692 }
1693 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
1694
1695 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
1696 {
1697         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1698 }
1699 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
1700
1701 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
1702 {
1703         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
1704 }
1705 EXPORT_SYMBOL_GPL(gfn_to_pfn);
1706
1707 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
1708 {
1709         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
1710 }
1711 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
1712
1713 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
1714                             struct page **pages, int nr_pages)
1715 {
1716         unsigned long addr;
1717         gfn_t entry = 0;
1718
1719         addr = gfn_to_hva_many(slot, gfn, &entry);
1720         if (kvm_is_error_hva(addr))
1721                 return -1;
1722
1723         if (entry < nr_pages)
1724                 return 0;
1725
1726         return __get_user_pages_fast(addr, nr_pages, 1, pages);
1727 }
1728 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
1729
1730 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
1731 {
1732         if (is_error_noslot_pfn(pfn))
1733                 return KVM_ERR_PTR_BAD_PAGE;
1734
1735         if (kvm_is_reserved_pfn(pfn)) {
1736                 WARN_ON(1);
1737                 return KVM_ERR_PTR_BAD_PAGE;
1738         }
1739
1740         return pfn_to_page(pfn);
1741 }
1742
1743 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1744 {
1745         kvm_pfn_t pfn;
1746
1747         pfn = gfn_to_pfn(kvm, gfn);
1748
1749         return kvm_pfn_to_page(pfn);
1750 }
1751 EXPORT_SYMBOL_GPL(gfn_to_page);
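/*
 * Illustrative sketch, not part of the kernel source: the expected calling
 * convention around gfn_to_page().  The returned page carries a reference
 * that the caller must drop with kvm_release_page_clean() or, if the page
 * was written, kvm_release_page_dirty().  "kvm" and "gfn" are assumed to be
 * in scope.
 *
 *        struct page *page = gfn_to_page(kvm, gfn);
 *
 *        if (is_error_page(page))
 *                return -EFAULT;
 *        // ... access the page contents, e.g. through kmap(page) ...
 *        kvm_release_page_dirty(page);        // or _clean() if nothing was written
 */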
1752
1753 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
1754 {
1755         if (pfn == 0)
1756                 return;
1757
1758         if (cache)
1759                 cache->pfn = cache->gfn = 0;
1760
1761         if (dirty)
1762                 kvm_release_pfn_dirty(pfn);
1763         else
1764                 kvm_release_pfn_clean(pfn);
1765 }
1766
1767 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
1768                                  struct gfn_to_pfn_cache *cache, u64 gen)
1769 {
1770         kvm_release_pfn(cache->pfn, cache->dirty, cache);
1771
1772         cache->pfn = gfn_to_pfn_memslot(slot, gfn);
1773         cache->gfn = gfn;
1774         cache->dirty = false;
1775         cache->generation = gen;
1776 }
1777
1778 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
1779                          struct kvm_host_map *map,
1780                          struct gfn_to_pfn_cache *cache,
1781                          bool atomic)
1782 {
1783         kvm_pfn_t pfn;
1784         void *hva = NULL;
1785         struct page *page = KVM_UNMAPPED_PAGE;
1786         struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
1787         u64 gen = slots->generation;
1788
1789         if (!map)
1790                 return -EINVAL;
1791
1792         if (cache) {
1793                 if (!cache->pfn || cache->gfn != gfn ||
1794                         cache->generation != gen) {
1795                         if (atomic)
1796                                 return -EAGAIN;
1797                         kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
1798                 }
1799                 pfn = cache->pfn;
1800         } else {
1801                 if (atomic)
1802                         return -EAGAIN;
1803                 pfn = gfn_to_pfn_memslot(slot, gfn);
1804         }
1805         if (is_error_noslot_pfn(pfn))
1806                 return -EINVAL;
1807
1808         if (pfn_valid(pfn)) {
1809                 page = pfn_to_page(pfn);
1810                 if (atomic)
1811                         hva = kmap_atomic(page);
1812                 else
1813                         hva = kmap(page);
1814 #ifdef CONFIG_HAS_IOMEM
1815         } else if (!atomic) {
1816                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
1817         } else {
1818                 return -EINVAL;
1819 #endif
1820         }
1821
1822         if (!hva)
1823                 return -EFAULT;
1824
1825         map->page = page;
1826         map->hva = hva;
1827         map->pfn = pfn;
1828         map->gfn = gfn;
1829
1830         return 0;
1831 }
1832
1833 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
1834                 struct gfn_to_pfn_cache *cache, bool atomic)
1835 {
1836         return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
1837                         cache, atomic);
1838 }
1839 EXPORT_SYMBOL_GPL(kvm_map_gfn);
1840
1841 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
1842 {
1843         return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
1844                 NULL, false);
1845 }
1846 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
1847
1848 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
1849                         struct kvm_host_map *map,
1850                         struct gfn_to_pfn_cache *cache,
1851                         bool dirty, bool atomic)
1852 {
1853         if (!map)
1854                 return;
1855
1856         if (!map->hva)
1857                 return;
1858
1859         if (map->page != KVM_UNMAPPED_PAGE) {
1860                 if (atomic)
1861                         kunmap_atomic(map->hva);
1862                 else
1863                         kunmap(map->page);
1864         }
1865 #ifdef CONFIG_HAS_IOMEM
1866         else if (!atomic)
1867                 memunmap(map->hva);
1868         else
1869                 WARN_ONCE(1, "Unexpected unmapping in atomic context");
1870 #endif
1871
1872         if (dirty)
1873                 mark_page_dirty_in_slot(memslot, map->gfn);
1874
1875         if (cache)
1876                 cache->dirty |= dirty;
1877         else
1878                 kvm_release_pfn(map->pfn, dirty, NULL);
1879
1880         map->hva = NULL;
1881         map->page = NULL;
1882 }
1883
1884 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
1885                   struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
1886 {
1887         __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
1888                         cache, dirty, atomic);
1889         return 0;
1890 }
1891 EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
1892
1893 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
1894 {
1895         __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
1896                         dirty, false);
1897 }
1898 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
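/*
 * Illustrative sketch, not part of the kernel source: the kvm_vcpu_map() /
 * kvm_vcpu_unmap() pairing.  Unlike gfn_to_page(), this also works for
 * remapped memory without a struct page (handled via memremap() above).
 * "vcpu", "gfn", "data", "offset" and "len" are hypothetical.
 *
 *        struct kvm_host_map map;
 *
 *        if (kvm_vcpu_map(vcpu, gfn, &map))
 *                return -EFAULT;
 *        memcpy(map.hva + offset, data, len);
 *        kvm_vcpu_unmap(vcpu, &map, true);        // true: the page was dirtied
 */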
1899
1900 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
1901 {
1902         kvm_pfn_t pfn;
1903
1904         pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
1905
1906         return kvm_pfn_to_page(pfn);
1907 }
1908 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
1909
1910 void kvm_release_page_clean(struct page *page)
1911 {
1912         WARN_ON(is_error_page(page));
1913
1914         kvm_release_pfn_clean(page_to_pfn(page));
1915 }
1916 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
1917
1918 void kvm_release_pfn_clean(kvm_pfn_t pfn)
1919 {
1920         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
1921                 put_page(pfn_to_page(pfn));
1922 }
1923 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
1924
1925 void kvm_release_page_dirty(struct page *page)
1926 {
1927         WARN_ON(is_error_page(page));
1928
1929         kvm_release_pfn_dirty(page_to_pfn(page));
1930 }
1931 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
1932
1933 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
1934 {
1935         kvm_set_pfn_dirty(pfn);
1936         kvm_release_pfn_clean(pfn);
1937 }
1938 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
1939
1940 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
1941 {
1942         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
1943                 struct page *page = pfn_to_page(pfn);
1944
1945                 if (!PageReserved(page))
1946                         SetPageDirty(page);
1947         }
1948 }
1949 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
1950
1951 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
1952 {
1953         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
1954                 mark_page_accessed(pfn_to_page(pfn));
1955 }
1956 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
1957
1958 void kvm_get_pfn(kvm_pfn_t pfn)
1959 {
1960         if (!kvm_is_reserved_pfn(pfn))
1961                 get_page(pfn_to_page(pfn));
1962 }
1963 EXPORT_SYMBOL_GPL(kvm_get_pfn);
1964
1965 static int next_segment(unsigned long len, int offset)
1966 {
1967         if (len > PAGE_SIZE - offset)
1968                 return PAGE_SIZE - offset;
1969         else
1970                 return len;
1971 }
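/*
 * Worked example, assuming PAGE_SIZE == 4096: the guest read/write loops
 * below use next_segment() to split an access of len == 5000 starting at
 * page offset 3000 into per-page chunks:
 *
 *        next_segment(5000, 3000) -> 1096        (finish the first page)
 *        next_segment(3904,    0) -> 3904        (fits entirely in the next page)
 *        next_segment(   0,    0) ->    0        (terminates the loop)
 */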
1972
1973 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
1974                                  void *data, int offset, int len)
1975 {
1976         int r;
1977         unsigned long addr;
1978
1979         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
1980         if (kvm_is_error_hva(addr))
1981                 return -EFAULT;
1982         r = __copy_from_user(data, (void __user *)addr + offset, len);
1983         if (r)
1984                 return -EFAULT;
1985         return 0;
1986 }
1987
1988 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1989                         int len)
1990 {
1991         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
1992
1993         return __kvm_read_guest_page(slot, gfn, data, offset, len);
1994 }
1995 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1996
1997 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
1998                              int offset, int len)
1999 {
2000         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2001
2002         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2003 }
2004 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2005
2006 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2007 {
2008         gfn_t gfn = gpa >> PAGE_SHIFT;
2009         int seg;
2010         int offset = offset_in_page(gpa);
2011         int ret;
2012
2013         while ((seg = next_segment(len, offset)) != 0) {
2014                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2015                 if (ret < 0)
2016                         return ret;
2017                 offset = 0;
2018                 len -= seg;
2019                 data += seg;
2020                 ++gfn;
2021         }
2022         return 0;
2023 }
2024 EXPORT_SYMBOL_GPL(kvm_read_guest);
2025
2026 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2027 {
2028         gfn_t gfn = gpa >> PAGE_SHIFT;
2029         int seg;
2030         int offset = offset_in_page(gpa);
2031         int ret;
2032
2033         while ((seg = next_segment(len, offset)) != 0) {
2034                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2035                 if (ret < 0)
2036                         return ret;
2037                 offset = 0;
2038                 len -= seg;
2039                 data += seg;
2040                 ++gfn;
2041         }
2042         return 0;
2043 }
2044 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2045
2046 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2047                                    void *data, int offset, unsigned long len)
2048 {
2049         int r;
2050         unsigned long addr;
2051
2052         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2053         if (kvm_is_error_hva(addr))
2054                 return -EFAULT;
2055         pagefault_disable();
2056         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2057         pagefault_enable();
2058         if (r)
2059                 return -EFAULT;
2060         return 0;
2061 }
2062
2063 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
2064                           unsigned long len)
2065 {
2066         gfn_t gfn = gpa >> PAGE_SHIFT;
2067         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2068         int offset = offset_in_page(gpa);
2069
2070         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2071 }
2072 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
2073
2074 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2075                                void *data, unsigned long len)
2076 {
2077         gfn_t gfn = gpa >> PAGE_SHIFT;
2078         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2079         int offset = offset_in_page(gpa);
2080
2081         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2082 }
2083 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2084
2085 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn,
2086                                   const void *data, int offset, int len)
2087 {
2088         int r;
2089         unsigned long addr;
2090
2091         addr = gfn_to_hva_memslot(memslot, gfn);
2092         if (kvm_is_error_hva(addr))
2093                 return -EFAULT;
2094         r = __copy_to_user((void __user *)addr + offset, data, len);
2095         if (r)
2096                 return -EFAULT;
2097         mark_page_dirty_in_slot(memslot, gfn);
2098         return 0;
2099 }
2100
2101 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2102                          const void *data, int offset, int len)
2103 {
2104         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2105
2106         return __kvm_write_guest_page(slot, gfn, data, offset, len);
2107 }
2108 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2109
2110 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2111                               const void *data, int offset, int len)
2112 {
2113         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2114
2115         return __kvm_write_guest_page(slot, gfn, data, offset, len);
2116 }
2117 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2118
2119 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2120                     unsigned long len)
2121 {
2122         gfn_t gfn = gpa >> PAGE_SHIFT;
2123         int seg;
2124         int offset = offset_in_page(gpa);
2125         int ret;
2126
2127         while ((seg = next_segment(len, offset)) != 0) {
2128                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2129                 if (ret < 0)
2130                         return ret;
2131                 offset = 0;
2132                 len -= seg;
2133                 data += seg;
2134                 ++gfn;
2135         }
2136         return 0;
2137 }
2138 EXPORT_SYMBOL_GPL(kvm_write_guest);
2139
2140 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2141                          unsigned long len)
2142 {
2143         gfn_t gfn = gpa >> PAGE_SHIFT;
2144         int seg;
2145         int offset = offset_in_page(gpa);
2146         int ret;
2147
2148         while ((seg = next_segment(len, offset)) != 0) {
2149                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2150                 if (ret < 0)
2151                         return ret;
2152                 offset = 0;
2153                 len -= seg;
2154                 data += seg;
2155                 ++gfn;
2156         }
2157         return 0;
2158 }
2159 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2160
2161 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2162                                        struct gfn_to_hva_cache *ghc,
2163                                        gpa_t gpa, unsigned long len)
2164 {
2165         int offset = offset_in_page(gpa);
2166         gfn_t start_gfn = gpa >> PAGE_SHIFT;
2167         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2168         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2169         gfn_t nr_pages_avail;
2170
2171         ghc->gpa = gpa;
2172         ghc->generation = slots->generation;
2173         ghc->len = len;
2174         ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2175         ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
2176         if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
2177                 ghc->hva += offset;
2178         } else {
2179                 /*
2180                  * If the requested region crosses two memslots, we still
2181                  * verify that the entire region is valid here.
2182                  */
2183                 while (start_gfn <= end_gfn) {
2184                         nr_pages_avail = 0;
2185                         ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2186                         ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2187                                                    &nr_pages_avail);
2188                         if (kvm_is_error_hva(ghc->hva))
2189                                 return -EFAULT;
2190                         start_gfn += nr_pages_avail;
2191                 }
2192                 /* Use the slow path for cross page reads and writes. */
2193                 ghc->memslot = NULL;
2194         }
2195         return 0;
2196 }
2197
2198 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2199                               gpa_t gpa, unsigned long len)
2200 {
2201         struct kvm_memslots *slots = kvm_memslots(kvm);
2202         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
2203 }
2204 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
2205
2206 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2207                                   void *data, unsigned int offset,
2208                                   unsigned long len)
2209 {
2210         struct kvm_memslots *slots = kvm_memslots(kvm);
2211         int r;
2212         gpa_t gpa = ghc->gpa + offset;
2213
2214         BUG_ON(len + offset > ghc->len);
2215
2216         if (slots->generation != ghc->generation)
2217                 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2218
2219         if (kvm_is_error_hva(ghc->hva))
2220                 return -EFAULT;
2221
2222         if (unlikely(!ghc->memslot))
2223                 return kvm_write_guest(kvm, gpa, data, len);
2224
2225         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
2226         if (r)
2227                 return -EFAULT;
2228         mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT);
2229
2230         return 0;
2231 }
2232 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
2233
2234 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2235                            void *data, unsigned long len)
2236 {
2237         return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
2238 }
2239 EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
2240
2241 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2242                            void *data, unsigned long len)
2243 {
2244         struct kvm_memslots *slots = kvm_memslots(kvm);
2245         int r;
2246
2247         BUG_ON(len > ghc->len);
2248
2249         if (slots->generation != ghc->generation)
2250                 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2251
2252         if (kvm_is_error_hva(ghc->hva))
2253                 return -EFAULT;
2254
2255         if (unlikely(!ghc->memslot))
2256                 return kvm_read_guest(kvm, ghc->gpa, data, len);
2257
2258         r = __copy_from_user(data, (void __user *)ghc->hva, len);
2259         if (r)
2260                 return -EFAULT;
2261
2262         return 0;
2263 }
2264 EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
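/*
 * Illustrative sketch, not part of the kernel source: a gfn_to_hva_cache
 * resolves the gpa->hva translation once and then makes repeated accesses
 * cheap; the generation check above revalidates the cache after memslot
 * changes.  "kvm", "gpa" and the guest-shared structure "st" are
 * hypothetical.
 *
 *        struct gfn_to_hva_cache ghc;
 *
 *        if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(st)))
 *                return -EFAULT;
 *        // ... later, possibly many times:
 *        kvm_write_guest_cached(kvm, &ghc, &st, sizeof(st));
 *        kvm_read_guest_cached(kvm, &ghc, &st, sizeof(st));
 */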
2265
2266 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
2267 {
2268         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
2269
2270         return kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
2271 }
2272 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
2273
2274 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
2275 {
2276         gfn_t gfn = gpa >> PAGE_SHIFT;
2277         int seg;
2278         int offset = offset_in_page(gpa);
2279         int ret;
2280
2281         while ((seg = next_segment(len, offset)) != 0) {
2282                 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
2283                 if (ret < 0)
2284                         return ret;
2285                 offset = 0;
2286                 len -= seg;
2287                 ++gfn;
2288         }
2289         return 0;
2290 }
2291 EXPORT_SYMBOL_GPL(kvm_clear_guest);
2292
2293 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot,
2294                                     gfn_t gfn)
2295 {
2296         if (memslot && memslot->dirty_bitmap) {
2297                 unsigned long rel_gfn = gfn - memslot->base_gfn;
2298
2299                 set_bit_le(rel_gfn, memslot->dirty_bitmap);
2300         }
2301 }
2302
2303 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
2304 {
2305         struct kvm_memory_slot *memslot;
2306
2307         memslot = gfn_to_memslot(kvm, gfn);
2308         mark_page_dirty_in_slot(memslot, gfn);
2309 }
2310 EXPORT_SYMBOL_GPL(mark_page_dirty);
2311
2312 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
2313 {
2314         struct kvm_memory_slot *memslot;
2315
2316         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2317         mark_page_dirty_in_slot(memslot, gfn);
2318 }
2319 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
2320
2321 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
2322 {
2323         if (!vcpu->sigset_active)
2324                 return;
2325
2326         /*
2327          * This does a lockless modification of ->real_blocked, which is fine
2328          * because only current can change ->real_blocked, and all readers of
2329          * ->real_blocked don't care as long as ->real_blocked is always a subset
2330          * of ->blocked.
2331          */
2332         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
2333 }
2334
2335 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
2336 {
2337         if (!vcpu->sigset_active)
2338                 return;
2339
2340         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
2341         sigemptyset(&current->real_blocked);
2342 }
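/*
 * Illustrative userspace sketch, not part of the kernel source: the mask
 * installed by kvm_sigset_activate() is the one a VMM registers with
 * KVM_SET_SIGNAL_MASK, so that a normally-blocked signal is delivered only
 * while KVM_RUN is in progress.  "vcpu_fd" is hypothetical; note that ->len
 * must be the kernel's sigset size (8 bytes on most 64-bit targets), not
 * glibc's much larger sigset_t.
 *
 *        sigset_t set;
 *        struct kvm_signal_mask *mask;
 *
 *        sigemptyset(&set);
 *        sigaddset(&set, SIGUSR1);
 *        pthread_sigmask(SIG_BLOCK, &set, NULL);
 *
 *        mask = malloc(sizeof(*mask) + 8);
 *        mask->len = 8;
 *        memcpy(mask->sigset, &set, 8);
 *        ioctl(vcpu_fd, KVM_SET_SIGNAL_MASK, mask);
 */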
2343
2344 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
2345 {
2346         unsigned int old, val, grow;
2347
2348         old = val = vcpu->halt_poll_ns;
2349         grow = READ_ONCE(halt_poll_ns_grow);
2350         /* 10us base */
2351         if (val == 0 && grow)
2352                 val = 10000;
2353         else
2354                 val *= grow;
2355
2356         if (val > halt_poll_ns)
2357                 val = halt_poll_ns;
2358
2359         vcpu->halt_poll_ns = val;
2360         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
2361 }
2362
2363 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
2364 {
2365         unsigned int old, val, shrink;
2366
2367         old = val = vcpu->halt_poll_ns;
2368         shrink = READ_ONCE(halt_poll_ns_shrink);
2369         if (shrink == 0)
2370                 val = 0;
2371         else
2372                 val /= shrink;
2373
2374         vcpu->halt_poll_ns = val;
2375         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
2376 }
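/*
 * Worked example of the grow/shrink arithmetic above, assuming the module
 * parameters halt_poll_ns_grow == 2, halt_poll_ns_shrink == 0 and
 * halt_poll_ns == 200000 (the concrete values are only an assumption):
 *
 *        grow:   0 -> 10000 -> 20000 -> 40000 -> 80000 -> 160000 -> 200000 (capped)
 *        shrink: any value -> 0        (a divisor of 0 resets polling entirely;
 *                                       a divisor of 2 would halve the window instead)
 */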
2377
2378 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
2379 {
2380         int ret = -EINTR;
2381         int idx = srcu_read_lock(&vcpu->kvm->srcu);
2382
2383         if (kvm_arch_vcpu_runnable(vcpu)) {
2384                 kvm_make_request(KVM_REQ_UNHALT, vcpu);
2385                 goto out;
2386         }
2387         if (kvm_cpu_has_pending_timer(vcpu))
2388                 goto out;
2389         if (signal_pending(current))
2390                 goto out;
2391
2392         ret = 0;
2393 out:
2394         srcu_read_unlock(&vcpu->kvm->srcu, idx);
2395         return ret;
2396 }
2397
2398 /*
2399  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
2400  */
2401 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2402 {
2403         ktime_t start, cur;
2404         DECLARE_SWAITQUEUE(wait);
2405         bool waited = false;
2406         u64 block_ns;
2407
2408         start = cur = ktime_get();
2409         if (vcpu->halt_poll_ns) {
2410                 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
2411
2412                 ++vcpu->stat.halt_attempted_poll;
2413                 do {
2414                         /*
2415                          * This sets KVM_REQ_UNHALT if an interrupt
2416                          * arrives.
2417                          */
2418                         if (kvm_vcpu_check_block(vcpu) < 0) {
2419                                 ++vcpu->stat.halt_successful_poll;
2420                                 if (!vcpu_valid_wakeup(vcpu))
2421                                         ++vcpu->stat.halt_poll_invalid;
2422                                 goto out;
2423                         }
2424                         cur = ktime_get();
2425                 } while (single_task_running() && ktime_before(cur, stop));
2426         }
2427
2428         kvm_arch_vcpu_blocking(vcpu);
2429
2430         for (;;) {
2431                 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2432
2433                 if (kvm_vcpu_check_block(vcpu) < 0)
2434                         break;
2435
2436                 waited = true;
2437                 schedule();
2438         }
2439
2440         finish_swait(&vcpu->wq, &wait);
2441         cur = ktime_get();
2442
2443         kvm_arch_vcpu_unblocking(vcpu);
2444 out:
2445         block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
2446
2447         if (!vcpu_valid_wakeup(vcpu))
2448                 shrink_halt_poll_ns(vcpu);
2449         else if (halt_poll_ns) {
2450                 if (block_ns <= vcpu->halt_poll_ns)
2451                         ;
2452                 /* we had a long block, shrink polling */
2453                 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
2454                         shrink_halt_poll_ns(vcpu);
2455                 /* we had a short halt and our poll time is too small */
2456                 else if (vcpu->halt_poll_ns < halt_poll_ns &&
2457                         block_ns < halt_poll_ns)
2458                         grow_halt_poll_ns(vcpu);
2459         } else
2460                 vcpu->halt_poll_ns = 0;
2461
2462         trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
2463         kvm_arch_vcpu_block_finish(vcpu);
2464 }
2465 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
2466
2467 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2468 {
2469         struct swait_queue_head *wqp;
2470
2471         wqp = kvm_arch_vcpu_wq(vcpu);
2472         if (swq_has_sleeper(wqp)) {
2473                 swake_up_one(wqp);
2474                 ++vcpu->stat.halt_wakeup;
2475                 return true;
2476         }
2477
2478         return false;
2479 }
2480 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
2481
2482 #ifndef CONFIG_S390
2483 /*
2484  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
2485  */
2486 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
2487 {
2488         int me;
2489         int cpu = vcpu->cpu;
2490
2491         if (kvm_vcpu_wake_up(vcpu))
2492                 return;
2493
2494         me = get_cpu();
2495         if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
2496                 if (kvm_arch_vcpu_should_kick(vcpu))
2497                         smp_send_reschedule(cpu);
2498         put_cpu();
2499 }
2500 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
2501 #endif /* !CONFIG_S390 */
2502
2503 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
2504 {
2505         struct pid *pid;
2506         struct task_struct *task = NULL;
2507         int ret = 0;
2508
2509         rcu_read_lock();
2510         pid = rcu_dereference(target->pid);
2511         if (pid)
2512                 task = get_pid_task(pid, PIDTYPE_PID);
2513         rcu_read_unlock();
2514         if (!task)
2515                 return ret;
2516         ret = yield_to(task, 1);
2517         put_task_struct(task);
2518
2519         return ret;
2520 }
2521 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
2522
2523 /*
2524  * Helper that checks whether a VCPU is eligible for directed yield.
2525  * The most eligible candidate to yield to is chosen by the following heuristics:
2526  *
2527  *  (a) A VCPU which has not recently taken a PLE exit or had cpu-relax
2528  *  intercepted (a preempted lock holder), indicated by @in_spin_loop.
2529  *  Set at the beginning and cleared at the end of the interception/PLE handler.
2530  *
2531  *  (b) A VCPU which has taken a PLE exit/cpu-relax intercept but did not get
2532  *  a chance last time (it has mostly become eligible now, since we probably
2533  *  yielded to the lock holder in the last iteration.  This is done by toggling
2534  *  @dy_eligible each time a VCPU is checked for eligibility.)
2535  *
2536  *  Yielding to a recently PLE-exited/cpu-relax intercepted VCPU before yielding
2537  *  to a preempted lock holder could result in wrong VCPU selection and CPU
2538  *  burning.  Giving priority to a potential lock holder increases lock
2539  *  progress.
2540  *
2541  *  Since the algorithm is based on heuristics, accessing another VCPU's data
2542  *  without locking does no harm.  It may result in trying to yield to the same
2543  *  VCPU, failing, and continuing with the next VCPU, and so on.
2544  */
2545 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
2546 {
2547 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
2548         bool eligible;
2549
2550         eligible = !vcpu->spin_loop.in_spin_loop ||
2551                     vcpu->spin_loop.dy_eligible;
2552
2553         if (vcpu->spin_loop.in_spin_loop)
2554                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
2555
2556         return eligible;
2557 #else
2558         return true;
2559 #endif
2560 }
2561
2562 /*
2563  * Unlike kvm_arch_vcpu_runnable, this function is called outside
2564  * a vcpu_load/vcpu_put pair.  However, for most architectures
2565  * kvm_arch_vcpu_runnable does not require vcpu_load.
2566  */
2567 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
2568 {
2569         return kvm_arch_vcpu_runnable(vcpu);
2570 }
2571
2572 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
2573 {
2574         if (kvm_arch_dy_runnable(vcpu))
2575                 return true;
2576
2577 #ifdef CONFIG_KVM_ASYNC_PF
2578         if (!list_empty_careful(&vcpu->async_pf.done))
2579                 return true;
2580 #endif
2581
2582         return false;
2583 }
2584
2585 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
2586 {
2587         struct kvm *kvm = me->kvm;
2588         struct kvm_vcpu *vcpu;
2589         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
2590         int yielded = 0;
2591         int try = 3;
2592         int pass;
2593         int i;
2594
2595         kvm_vcpu_set_in_spin_loop(me, true);
2596         /*
2597          * We boost the priority of a VCPU that is runnable but not
2598          * currently running, because it got preempted by something
2599          * else and called schedule in __vcpu_run.  Hopefully that
2600          * VCPU is holding the lock that we need and will release it.
2601          * We approximate round-robin by starting at the last boosted VCPU.
2602          */
2603         for (pass = 0; pass < 2 && !yielded && try; pass++) {
2604                 kvm_for_each_vcpu(i, vcpu, kvm) {
2605                         if (!pass && i <= last_boosted_vcpu) {
2606                                 i = last_boosted_vcpu;
2607                                 continue;
2608                         } else if (pass && i > last_boosted_vcpu)
2609                                 break;
2610                         if (!READ_ONCE(vcpu->preempted))
2611                                 continue;
2612                         if (vcpu == me)
2613                                 continue;
2614                         if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu))
2615                                 continue;
2616                         if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu))
2617                                 continue;
2618                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
2619                                 continue;
2620
2621                         yielded = kvm_vcpu_yield_to(vcpu);
2622                         if (yielded > 0) {
2623                                 kvm->last_boosted_vcpu = i;
2624                                 break;
2625                         } else if (yielded < 0) {
2626                                 try--;
2627                                 if (!try)
2628                                         break;
2629                         }
2630                 }
2631         }
2632         kvm_vcpu_set_in_spin_loop(me, false);
2633
2634         /* Ensure vcpu is not eligible during next spinloop */
2635         kvm_vcpu_set_dy_eligible(me, false);
2636 }
2637 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
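/*
 * Worked example of the two-pass scan above: with four online vCPUs and
 * kvm->last_boosted_vcpu == 2, pass 0 considers only vCPU 3 (the indices
 * after the last boosted one) and pass 1 wraps around to vCPUs 0, 1 and 2,
 * giving the approximate round-robin order 3, 0, 1, 2.
 */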
2638
2639 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
2640 {
2641         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
2642         struct page *page;
2643
2644         if (vmf->pgoff == 0)
2645                 page = virt_to_page(vcpu->run);
2646 #ifdef CONFIG_X86
2647         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
2648                 page = virt_to_page(vcpu->arch.pio_data);
2649 #endif
2650 #ifdef CONFIG_KVM_MMIO
2651         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
2652                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
2653 #endif
2654         else
2655                 return kvm_arch_vcpu_fault(vcpu, vmf);
2656         get_page(page);
2657         vmf->page = page;
2658         return 0;
2659 }
2660
2661 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
2662         .fault = kvm_vcpu_fault,
2663 };
2664
2665 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2666 {
2667         vma->vm_ops = &kvm_vcpu_vm_ops;
2668         return 0;
2669 }
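/*
 * Illustrative userspace sketch, not part of the kernel source: how a VMM
 * obtains the shared kvm_run structure that kvm_vcpu_fault() exposes at
 * page offset 0.  "kvm_fd" (the /dev/kvm fd) and "vcpu_fd" are hypothetical.
 *
 *        int mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *        struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *                                   MAP_SHARED, vcpu_fd, 0);
 */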
2670
2671 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2672 {
2673         struct kvm_vcpu *vcpu = filp->private_data;
2674
2675         debugfs_remove_recursive(vcpu->debugfs_dentry);
2676         kvm_put_kvm(vcpu->kvm);
2677         return 0;
2678 }
2679
2680 static struct file_operations kvm_vcpu_fops = {
2681         .release        = kvm_vcpu_release,
2682         .unlocked_ioctl = kvm_vcpu_ioctl,
2683         .mmap           = kvm_vcpu_mmap,
2684         .llseek         = noop_llseek,
2685         KVM_COMPAT(kvm_vcpu_compat_ioctl),
2686 };
2687
2688 /*
2689  * Allocates an inode for the vcpu.
2690  */
2691 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2692 {
2693         char name[8 + 1 + ITOA_MAX_LEN + 1];
2694
2695         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
2696         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
2697 }
2698
2699 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
2700 {
2701         char dir_name[ITOA_MAX_LEN * 2];
2702         int ret;
2703
2704         if (!kvm_arch_has_vcpu_debugfs())
2705                 return 0;
2706
2707         if (!debugfs_initialized())
2708                 return 0;
2709
2710         snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
2711         vcpu->debugfs_dentry = debugfs_create_dir(dir_name,
2712                                                                 vcpu->kvm->debugfs_dentry);
2713         if (!vcpu->debugfs_dentry)
2714                 return -ENOMEM;
2715
2716         ret = kvm_arch_create_vcpu_debugfs(vcpu);
2717         if (ret < 0) {
2718                 debugfs_remove_recursive(vcpu->debugfs_dentry);
2719                 return ret;
2720         }
2721
2722         return 0;
2723 }
2724
2725 /*
2726  * Creates some virtual cpus.  Good luck creating more than one.
2727  */
2728 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
2729 {
2730         int r;
2731         struct kvm_vcpu *vcpu;
2732
2733         if (id >= KVM_MAX_VCPU_ID)
2734                 return -EINVAL;
2735
2736         mutex_lock(&kvm->lock);
2737         if (kvm->created_vcpus == KVM_MAX_VCPUS) {
2738                 mutex_unlock(&kvm->lock);
2739                 return -EINVAL;
2740         }
2741
2742         kvm->created_vcpus++;
2743         mutex_unlock(&kvm->lock);
2744
2745         vcpu = kvm_arch_vcpu_create(kvm, id);
2746         if (IS_ERR(vcpu)) {
2747                 r = PTR_ERR(vcpu);
2748                 goto vcpu_decrement;
2749         }
2750
2751         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2752
2753         r = kvm_arch_vcpu_setup(vcpu);
2754         if (r)
2755                 goto vcpu_destroy;
2756
2757         r = kvm_create_vcpu_debugfs(vcpu);
2758         if (r)
2759                 goto vcpu_destroy;
2760
2761         mutex_lock(&kvm->lock);
2762         if (kvm_get_vcpu_by_id(kvm, id)) {
2763                 r = -EEXIST;
2764                 goto unlock_vcpu_destroy;
2765         }
2766
2767         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
2768         BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
2769
2770         /* Now it's all set up, let userspace reach it */
2771         kvm_get_kvm(kvm);
2772         r = create_vcpu_fd(vcpu);
2773         if (r < 0) {
2774                 kvm_put_kvm(kvm);
2775                 goto unlock_vcpu_destroy;
2776         }
2777
2778         kvm->vcpus[vcpu->vcpu_idx] = vcpu;
2779
2780         /*
2781          * Pairs with smp_rmb() in kvm_get_vcpu.  Make the store to kvm->vcpus
2782          * visible before the incremented kvm->online_vcpus value.
2783          */
2784         smp_wmb();
2785         atomic_inc(&kvm->online_vcpus);
2786
2787         mutex_unlock(&kvm->lock);
2788         kvm_arch_vcpu_postcreate(vcpu);
2789         return r;
2790
2791 unlock_vcpu_destroy:
2792         mutex_unlock(&kvm->lock);
2793         debugfs_remove_recursive(vcpu->debugfs_dentry);
2794 vcpu_destroy:
2795         kvm_arch_vcpu_destroy(vcpu);
2796 vcpu_decrement:
2797         mutex_lock(&kvm->lock);
2798         kvm->created_vcpus--;
2799         mutex_unlock(&kvm->lock);
2800         return r;
2801 }
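/*
 * Illustrative userspace sketch, not part of the kernel source: the ioctl
 * that reaches kvm_vm_ioctl_create_vcpu() and the vcpu fd it hands back.
 * "vm_fd" (obtained via KVM_CREATE_VM) is hypothetical.
 *
 *        int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);        // vcpu id 0
 *
 *        if (vcpu_fd < 0)
 *                err(1, "KVM_CREATE_VCPU");
 */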
2802
2803 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2804 {
2805         if (sigset) {
2806                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2807                 vcpu->sigset_active = 1;
2808                 vcpu->sigset = *sigset;
2809         } else
2810                 vcpu->sigset_active = 0;
2811         return 0;
2812 }
2813
2814 static long kvm_vcpu_ioctl(struct file *filp,
2815                            unsigned int ioctl, unsigned long arg)
2816 {
2817         struct kvm_vcpu *vcpu = filp->private_data;
2818         void __user *argp = (void __user *)arg;
2819         int r;
2820         struct kvm_fpu *fpu = NULL;
2821         struct kvm_sregs *kvm_sregs = NULL;
2822
2823         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
2824                 return -EIO;
2825
2826         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
2827                 return -EINVAL;
2828
2829         /*
2830          * Some architectures have vcpu ioctls that are asynchronous to vcpu
2831          * execution; mutex_lock() would break them.
2832          */
2833         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
2834         if (r != -ENOIOCTLCMD)
2835                 return r;
2836
2837         if (mutex_lock_killable(&vcpu->mutex))
2838                 return -EINTR;
2839         switch (ioctl) {
2840         case KVM_RUN: {
2841                 struct pid *oldpid;
2842                 r = -EINVAL;
2843                 if (arg)
2844                         goto out;
2845                 oldpid = rcu_access_pointer(vcpu->pid);
2846                 if (unlikely(oldpid != task_pid(current))) {
2847                         /* The thread running this VCPU changed. */
2848                         struct pid *newpid;
2849
2850                         r = kvm_arch_vcpu_run_pid_change(vcpu);
2851                         if (r)
2852                                 break;
2853
2854                         newpid = get_task_pid(current, PIDTYPE_PID);
2855                         rcu_assign_pointer(vcpu->pid, newpid);
2856                         if (oldpid)
2857                                 synchronize_rcu();
2858                         put_pid(oldpid);
2859                 }
2860                 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
2861                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
2862                 break;
2863         }
2864         case KVM_GET_REGS: {
2865                 struct kvm_regs *kvm_regs;
2866
2867                 r = -ENOMEM;
2868                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
2869                 if (!kvm_regs)
2870                         goto out;
2871                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
2872                 if (r)
2873                         goto out_free1;
2874                 r = -EFAULT;
2875                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
2876                         goto out_free1;
2877                 r = 0;
2878 out_free1:
2879                 kfree(kvm_regs);
2880                 break;
2881         }
2882         case KVM_SET_REGS: {
2883                 struct kvm_regs *kvm_regs;
2884
2885                 r = -ENOMEM;
2886                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
2887                 if (IS_ERR(kvm_regs)) {
2888                         r = PTR_ERR(kvm_regs);
2889                         goto out;
2890                 }
2891                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
2892                 kfree(kvm_regs);
2893                 break;
2894         }
2895         case KVM_GET_SREGS: {
2896                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
2897                 r = -ENOMEM;
2898                 if (!kvm_sregs)
2899                         goto out;
2900                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
2901                 if (r)
2902                         goto out;
2903                 r = -EFAULT;
2904                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
2905                         goto out;
2906                 r = 0;
2907                 break;
2908         }
2909         case KVM_SET_SREGS: {
2910                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
2911                 if (IS_ERR(kvm_sregs)) {
2912                         r = PTR_ERR(kvm_sregs);
2913                         kvm_sregs = NULL;
2914                         goto out;
2915                 }
2916                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
2917                 break;
2918         }
2919         case KVM_GET_MP_STATE: {
2920                 struct kvm_mp_state mp_state;
2921
2922                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
2923                 if (r)
2924                         goto out;
2925                 r = -EFAULT;
2926                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
2927                         goto out;
2928                 r = 0;
2929                 break;
2930         }
2931         case KVM_SET_MP_STATE: {
2932                 struct kvm_mp_state mp_state;
2933
2934                 r = -EFAULT;
2935                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
2936                         goto out;
2937                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
2938                 break;
2939         }
2940         case KVM_TRANSLATE: {
2941                 struct kvm_translation tr;
2942
2943                 r = -EFAULT;
2944                 if (copy_from_user(&tr, argp, sizeof(tr)))
2945                         goto out;
2946                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
2947                 if (r)
2948                         goto out;
2949                 r = -EFAULT;
2950                 if (copy_to_user(argp, &tr, sizeof(tr)))
2951                         goto out;
2952                 r = 0;
2953                 break;
2954         }
2955         case KVM_SET_GUEST_DEBUG: {
2956                 struct kvm_guest_debug dbg;
2957
2958                 r = -EFAULT;
2959                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
2960                         goto out;
2961                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
2962                 break;
2963         }
2964         case KVM_SET_SIGNAL_MASK: {
2965                 struct kvm_signal_mask __user *sigmask_arg = argp;
2966                 struct kvm_signal_mask kvm_sigmask;
2967                 sigset_t sigset, *p;
2968
2969                 p = NULL;
2970                 if (argp) {
2971                         r = -EFAULT;
2972                         if (copy_from_user(&kvm_sigmask, argp,
2973                                            sizeof(kvm_sigmask)))
2974                                 goto out;
2975                         r = -EINVAL;
2976                         if (kvm_sigmask.len != sizeof(sigset))
2977                                 goto out;
2978                         r = -EFAULT;
2979                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2980                                            sizeof(sigset)))
2981                                 goto out;
2982                         p = &sigset;
2983                 }
2984                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2985                 break;
2986         }
2987         case KVM_GET_FPU: {
2988                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
2989                 r = -ENOMEM;
2990                 if (!fpu)
2991                         goto out;
2992                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
2993                 if (r)
2994                         goto out;
2995                 r = -EFAULT;
2996                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
2997                         goto out;
2998                 r = 0;
2999                 break;
3000         }
3001         case KVM_SET_FPU: {
3002                 fpu = memdup_user(argp, sizeof(*fpu));
3003                 if (IS_ERR(fpu)) {
3004                         r = PTR_ERR(fpu);
3005                         fpu = NULL;
3006                         goto out;
3007                 }
3008                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3009                 break;
3010         }
3011         default:
3012                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3013         }
3014 out:
3015         mutex_unlock(&vcpu->mutex);
3016         kfree(fpu);
3017         kfree(kvm_sregs);
3018         return r;
3019 }
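/*
 * Illustrative userspace sketch, not part of the kernel source: the typical
 * run loop driving the KVM_RUN case above.  "vcpu_fd", "run" (the mmap'ed
 * kvm_run structure) and handle_pio() are hypothetical.
 *
 *        for (;;) {
 *                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
 *                        if (errno == EINTR)        // a signal kicked the vCPU
 *                                continue;
 *                        break;
 *                }
 *                if (run->exit_reason == KVM_EXIT_IO)
 *                        handle_pio(run);
 *                else if (run->exit_reason == KVM_EXIT_HLT)
 *                        break;
 *        }
 */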
3020
3021 #ifdef CONFIG_KVM_COMPAT
3022 static long kvm_vcpu_compat_ioctl(struct file *filp,
3023                                   unsigned int ioctl, unsigned long arg)
3024 {
3025         struct kvm_vcpu *vcpu = filp->private_data;
3026         void __user *argp = compat_ptr(arg);
3027         int r;
3028
3029         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3030                 return -EIO;
3031
3032         switch (ioctl) {
3033         case KVM_SET_SIGNAL_MASK: {
3034                 struct kvm_signal_mask __user *sigmask_arg = argp;
3035                 struct kvm_signal_mask kvm_sigmask;
3036                 sigset_t sigset;
3037
3038                 if (argp) {
3039                         r = -EFAULT;
3040                         if (copy_from_user(&kvm_sigmask, argp,
3041                                            sizeof(kvm_sigmask)))
3042                                 goto out;
3043                         r = -EINVAL;
3044                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
3045                                 goto out;
3046                         r = -EFAULT;
3047                         if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
3048                                 goto out;
3049                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
3050                 } else
3051                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
3052                 break;
3053         }
3054         default:
3055                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
3056         }
3057
3058 out:
3059         return r;
3060 }
3061 #endif
3062
3063 static int kvm_device_ioctl_attr(struct kvm_device *dev,
3064                                  int (*accessor)(struct kvm_device *dev,
3065                                                  struct kvm_device_attr *attr),
3066                                  unsigned long arg)
3067 {
3068         struct kvm_device_attr attr;
3069
3070         if (!accessor)
3071                 return -EPERM;
3072
3073         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
3074                 return -EFAULT;
3075
3076         return accessor(dev, &attr);
3077 }
3078
3079 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
3080                              unsigned long arg)
3081 {
3082         struct kvm_device *dev = filp->private_data;
3083
3084         if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
3085                 return -EIO;
3086
3087         switch (ioctl) {
3088         case KVM_SET_DEVICE_ATTR:
3089                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
3090         case KVM_GET_DEVICE_ATTR:
3091                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
3092         case KVM_HAS_DEVICE_ATTR:
3093                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
3094         default:
3095                 if (dev->ops->ioctl)
3096                         return dev->ops->ioctl(dev, ioctl, arg);
3097
3098                 return -ENOTTY;
3099         }
3100 }
3101
3102 static int kvm_device_release(struct inode *inode, struct file *filp)
3103 {
3104         struct kvm_device *dev = filp->private_data;
3105         struct kvm *kvm = dev->kvm;
3106
3107         kvm_put_kvm(kvm);
3108         return 0;
3109 }
3110
3111 static const struct file_operations kvm_device_fops = {
3112         .unlocked_ioctl = kvm_device_ioctl,
3113         .release = kvm_device_release,
3114         KVM_COMPAT(kvm_device_ioctl),
3115 };
3116
3117 struct kvm_device *kvm_device_from_filp(struct file *filp)
3118 {
3119         if (filp->f_op != &kvm_device_fops)
3120                 return NULL;
3121
3122         return filp->private_data;
3123 }
3124
3125 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
3126 #ifdef CONFIG_KVM_MPIC
3127         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
3128         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
3129 #endif
3130 };
3131
3132 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
3133 {
3134         if (type >= ARRAY_SIZE(kvm_device_ops_table))
3135                 return -ENOSPC;
3136
3137         if (kvm_device_ops_table[type] != NULL)
3138                 return -EEXIST;
3139
3140         kvm_device_ops_table[type] = ops;
3141         return 0;
3142 }
3143
3144 void kvm_unregister_device_ops(u32 type)
3145 {
3146         if (kvm_device_ops_table[type] != NULL)
3147                 kvm_device_ops_table[type] = NULL;
3148 }
3149
3150 static int kvm_ioctl_create_device(struct kvm *kvm,
3151                                    struct kvm_create_device *cd)
3152 {
3153         struct kvm_device_ops *ops = NULL;
3154         struct kvm_device *dev;
3155         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
3156         int type;
3157         int ret;
3158
3159         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
3160                 return -ENODEV;
3161
3162         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
3163         ops = kvm_device_ops_table[type];
3164         if (ops == NULL)
3165                 return -ENODEV;
3166
3167         if (test)
3168                 return 0;
3169
3170         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3171         if (!dev)
3172                 return -ENOMEM;
3173
3174         dev->ops = ops;
3175         dev->kvm = kvm;
3176
3177         mutex_lock(&kvm->lock);
3178         ret = ops->create(dev, type);
3179         if (ret < 0) {
3180                 mutex_unlock(&kvm->lock);
3181                 kfree(dev);
3182                 return ret;
3183         }
3184         list_add(&dev->vm_node, &kvm->devices);
3185         mutex_unlock(&kvm->lock);
3186
3187         if (ops->init)
3188                 ops->init(dev);
3189
3190         kvm_get_kvm(kvm);
3191         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
3192         if (ret < 0) {
3193                 kvm_put_kvm(kvm);
3194                 mutex_lock(&kvm->lock);
3195                 list_del(&dev->vm_node);
3196                 mutex_unlock(&kvm->lock);
3197                 ops->destroy(dev);
3198                 return ret;
3199         }
3200
3201         cd->fd = ret;
3202         return 0;
3203 }
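
/*
 * A minimal userspace sketch (assuming "vm_fd" is a VM file descriptor from
 * KVM_CREATE_VM) of probing for a device type with KVM_CREATE_DEVICE_TEST,
 * which exercises the "test" path above without instantiating anything:
 *
 *	struct kvm_create_device cd = {
 *		.type  = KVM_DEV_TYPE_VFIO,
 *		.flags = KVM_CREATE_DEVICE_TEST,
 *	};
 *
 *	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0) {
 *		cd.flags = 0;
 *		ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);	// cd.fd now holds the device fd
 *	}
 */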
3204
3205 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
3206 {
3207         switch (arg) {
3208         case KVM_CAP_USER_MEMORY:
3209         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
3210         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
3211         case KVM_CAP_INTERNAL_ERROR_DATA:
3212 #ifdef CONFIG_HAVE_KVM_MSI
3213         case KVM_CAP_SIGNAL_MSI:
3214 #endif
3215 #ifdef CONFIG_HAVE_KVM_IRQFD
3216         case KVM_CAP_IRQFD:
3217         case KVM_CAP_IRQFD_RESAMPLE:
3218 #endif
3219         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
3220         case KVM_CAP_CHECK_EXTENSION_VM:
3221                 return 1;
3222 #ifdef CONFIG_KVM_MMIO
3223         case KVM_CAP_COALESCED_MMIO:
3224                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
3225 #endif
3226 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3227         case KVM_CAP_IRQ_ROUTING:
3228                 return KVM_MAX_IRQ_ROUTES;
3229 #endif
3230 #if KVM_ADDRESS_SPACE_NUM > 1
3231         case KVM_CAP_MULTI_ADDRESS_SPACE:
3232                 return KVM_ADDRESS_SPACE_NUM;
3233 #endif
3234         default:
3235                 break;
3236         }
3237         return kvm_vm_ioctl_check_extension(kvm, arg);
3238 }
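
/*
 * The generic capabilities above are queried from userspace with
 * KVM_CHECK_EXTENSION; since KVM_CAP_CHECK_EXTENSION_VM is reported, the
 * query may target a VM fd as well as /dev/kvm.  A minimal sketch (assuming
 * "vm_fd" exists):
 *
 *	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQFD) > 0)
 *		;	// irqfds can be used with this VM
 */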
3239
3240 static long kvm_vm_ioctl(struct file *filp,
3241                            unsigned int ioctl, unsigned long arg)
3242 {
3243         struct kvm *kvm = filp->private_data;
3244         void __user *argp = (void __user *)arg;
3245         int r;
3246
3247         if (kvm->mm != current->mm || kvm->vm_bugged)
3248                 return -EIO;
3249         switch (ioctl) {
3250         case KVM_CREATE_VCPU:
3251                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
3252                 break;
3253         case KVM_SET_USER_MEMORY_REGION: {
3254                 struct kvm_userspace_memory_region kvm_userspace_mem;
3255
3256                 r = -EFAULT;
3257                 if (copy_from_user(&kvm_userspace_mem, argp,
3258                                                 sizeof(kvm_userspace_mem)))
3259                         goto out;
3260
3261                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
3262                 break;
3263         }
3264         case KVM_GET_DIRTY_LOG: {
3265                 struct kvm_dirty_log log;
3266
3267                 r = -EFAULT;
3268                 if (copy_from_user(&log, argp, sizeof(log)))
3269                         goto out;
3270                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3271                 break;
3272         }
3273 #ifdef CONFIG_KVM_MMIO
3274         case KVM_REGISTER_COALESCED_MMIO: {
3275                 struct kvm_coalesced_mmio_zone zone;
3276
3277                 r = -EFAULT;
3278                 if (copy_from_user(&zone, argp, sizeof(zone)))
3279                         goto out;
3280                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
3281                 break;
3282         }
3283         case KVM_UNREGISTER_COALESCED_MMIO: {
3284                 struct kvm_coalesced_mmio_zone zone;
3285
3286                 r = -EFAULT;
3287                 if (copy_from_user(&zone, argp, sizeof(zone)))
3288                         goto out;
3289                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
3290                 break;
3291         }
3292 #endif
3293         case KVM_IRQFD: {
3294                 struct kvm_irqfd data;
3295
3296                 r = -EFAULT;
3297                 if (copy_from_user(&data, argp, sizeof(data)))
3298                         goto out;
3299                 r = kvm_irqfd(kvm, &data);
3300                 break;
3301         }
3302         case KVM_IOEVENTFD: {
3303                 struct kvm_ioeventfd data;
3304
3305                 r = -EFAULT;
3306                 if (copy_from_user(&data, argp, sizeof(data)))
3307                         goto out;
3308                 r = kvm_ioeventfd(kvm, &data);
3309                 break;
3310         }
3311 #ifdef CONFIG_HAVE_KVM_MSI
3312         case KVM_SIGNAL_MSI: {
3313                 struct kvm_msi msi;
3314
3315                 r = -EFAULT;
3316                 if (copy_from_user(&msi, argp, sizeof(msi)))
3317                         goto out;
3318                 r = kvm_send_userspace_msi(kvm, &msi);
3319                 break;
3320         }
3321 #endif
3322 #ifdef __KVM_HAVE_IRQ_LINE
3323         case KVM_IRQ_LINE_STATUS:
3324         case KVM_IRQ_LINE: {
3325                 struct kvm_irq_level irq_event;
3326
3327                 r = -EFAULT;
3328                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
3329                         goto out;
3330
3331                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
3332                                         ioctl == KVM_IRQ_LINE_STATUS);
3333                 if (r)
3334                         goto out;
3335
3336                 r = -EFAULT;
3337                 if (ioctl == KVM_IRQ_LINE_STATUS) {
3338                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
3339                                 goto out;
3340                 }
3341
3342                 r = 0;
3343                 break;
3344         }
3345 #endif
3346 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
3347         case KVM_SET_GSI_ROUTING: {
3348                 struct kvm_irq_routing routing;
3349                 struct kvm_irq_routing __user *urouting;
3350                 struct kvm_irq_routing_entry *entries = NULL;
3351
3352                 r = -EFAULT;
3353                 if (copy_from_user(&routing, argp, sizeof(routing)))
3354                         goto out;
3355                 r = -EINVAL;
3356                 if (!kvm_arch_can_set_irq_routing(kvm))
3357                         goto out;
3358                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
3359                         goto out;
3360                 if (routing.flags)
3361                         goto out;
3362                 if (routing.nr) {
3363                         r = -ENOMEM;
3364                         entries = vmalloc(array_size(sizeof(*entries),
3365                                                      routing.nr));
3366                         if (!entries)
3367                                 goto out;
3368                         r = -EFAULT;
3369                         urouting = argp;
3370                         if (copy_from_user(entries, urouting->entries,
3371                                            routing.nr * sizeof(*entries)))
3372                                 goto out_free_irq_routing;
3373                 }
3374                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
3375                                         routing.flags);
3376 out_free_irq_routing:
3377                 vfree(entries);
3378                 break;
3379         }
3380 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
3381         case KVM_CREATE_DEVICE: {
3382                 struct kvm_create_device cd;
3383
3384                 r = -EFAULT;
3385                 if (copy_from_user(&cd, argp, sizeof(cd)))
3386                         goto out;
3387
3388                 r = kvm_ioctl_create_device(kvm, &cd);
3389                 if (r)
3390                         goto out;
3391
3392                 r = -EFAULT;
3393                 if (copy_to_user(argp, &cd, sizeof(cd)))
3394                         goto out;
3395
3396                 r = 0;
3397                 break;
3398         }
3399         case KVM_CHECK_EXTENSION:
3400                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
3401                 break;
3402         default:
3403                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
3404         }
3405 out:
3406         return r;
3407 }
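
/*
 * A minimal userspace sketch of the KVM_SET_USER_MEMORY_REGION path handled
 * above (error handling omitted; "vm_fd" and "mem_size" are assumed to exist):
 *
 *	void *host = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
 *			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = mem_size,
 *		.userspace_addr  = (__u64)(unsigned long)host,
 *	};
 *
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */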
3408
3409 #ifdef CONFIG_KVM_COMPAT
3410 struct compat_kvm_dirty_log {
3411         __u32 slot;
3412         __u32 padding1;
3413         union {
3414                 compat_uptr_t dirty_bitmap; /* one bit per page */
3415                 __u64 padding2;
3416         };
3417 };
3418
3419 static long kvm_vm_compat_ioctl(struct file *filp,
3420                            unsigned int ioctl, unsigned long arg)
3421 {
3422         struct kvm *kvm = filp->private_data;
3423         int r;
3424
3425         if (kvm->mm != current->mm || kvm->vm_bugged)
3426                 return -EIO;
3427         switch (ioctl) {
3428         case KVM_GET_DIRTY_LOG: {
3429                 struct compat_kvm_dirty_log compat_log;
3430                 struct kvm_dirty_log log;
3431
3432                 if (copy_from_user(&compat_log, (void __user *)arg,
3433                                    sizeof(compat_log)))
3434                         return -EFAULT;
3435                 log.slot         = compat_log.slot;
3436                 log.padding1     = compat_log.padding1;
3437                 log.padding2     = compat_log.padding2;
3438                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
3439
3440                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
3441                 break;
3442         }
3443         default:
3444                 r = kvm_vm_ioctl(filp, ioctl, arg);
3445         }
3446         return r;
3447 }
3448 #endif
3449
3450 static struct file_operations kvm_vm_fops = {
3451         .release        = kvm_vm_release,
3452         .unlocked_ioctl = kvm_vm_ioctl,
3453         .llseek         = noop_llseek,
3454         KVM_COMPAT(kvm_vm_compat_ioctl),
3455 };
3456
3457 static int kvm_dev_ioctl_create_vm(unsigned long type)
3458 {
3459         int r;
3460         struct kvm *kvm;
3461         struct file *file;
3462
3463         kvm = kvm_create_vm(type);
3464         if (IS_ERR(kvm))
3465                 return PTR_ERR(kvm);
3466 #ifdef CONFIG_KVM_MMIO
3467         r = kvm_coalesced_mmio_init(kvm);
3468         if (r < 0)
3469                 goto put_kvm;
3470 #endif
3471         r = get_unused_fd_flags(O_CLOEXEC);
3472         if (r < 0)
3473                 goto put_kvm;
3474
3475         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
3476         if (IS_ERR(file)) {
3477                 put_unused_fd(r);
3478                 r = PTR_ERR(file);
3479                 goto put_kvm;
3480         }
3481
3482         /*
3483          * Don't call kvm_put_kvm anymore at this point; file->f_op is
3484          * already set, with ->release() being kvm_vm_release().  In error
3485          * cases it will be called by the final fput(file) and will take
3486          * care of doing kvm_put_kvm(kvm).
3487          */
3488         if (kvm_create_vm_debugfs(kvm, r) < 0) {
3489                 put_unused_fd(r);
3490                 fput(file);
3491                 return -ENOMEM;
3492         }
3493         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
3494
3495         fd_install(r, file);
3496         return r;
3497
3498 put_kvm:
3499         kvm_put_kvm(kvm);
3500         return r;
3501 }
3502
3503 static long kvm_dev_ioctl(struct file *filp,
3504                           unsigned int ioctl, unsigned long arg)
3505 {
3506         long r = -EINVAL;
3507
3508         switch (ioctl) {
3509         case KVM_GET_API_VERSION:
3510                 if (arg)
3511                         goto out;
3512                 r = KVM_API_VERSION;
3513                 break;
3514         case KVM_CREATE_VM:
3515                 r = kvm_dev_ioctl_create_vm(arg);
3516                 break;
3517         case KVM_CHECK_EXTENSION:
3518                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
3519                 break;
3520         case KVM_GET_VCPU_MMAP_SIZE:
3521                 if (arg)
3522                         goto out;
3523                 r = PAGE_SIZE;     /* struct kvm_run */
3524 #ifdef CONFIG_X86
3525                 r += PAGE_SIZE;    /* pio data page */
3526 #endif
3527 #ifdef CONFIG_KVM_MMIO
3528                 r += PAGE_SIZE;    /* coalesced mmio ring page */
3529 #endif
3530                 break;
3531         case KVM_TRACE_ENABLE:
3532         case KVM_TRACE_PAUSE:
3533         case KVM_TRACE_DISABLE:
3534                 r = -EOPNOTSUPP;
3535                 break;
3536         default:
3537                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
3538         }
3539 out:
3540         return r;
3541 }
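
/*
 * A minimal userspace sketch of the /dev/kvm ioctls dispatched above (error
 * handling omitted):
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	int api    = ioctl(kvm_fd, KVM_GET_API_VERSION, 0);	// KVM_API_VERSION, i.e. 12
 *	int vm_fd  = ioctl(kvm_fd, KVM_CREATE_VM, 0);		// default machine type
 *	int sz     = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);	// bytes to mmap per vcpu fd
 */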
3542
3543 static struct file_operations kvm_chardev_ops = {
3544         .unlocked_ioctl = kvm_dev_ioctl,
3545         .llseek         = noop_llseek,
3546         KVM_COMPAT(kvm_dev_ioctl),
3547 };
3548
3549 static struct miscdevice kvm_dev = {
3550         KVM_MINOR,
3551         "kvm",
3552         &kvm_chardev_ops,
3553 };
3554
3555 static void hardware_enable_nolock(void *junk)
3556 {
3557         int cpu = raw_smp_processor_id();
3558         int r;
3559
3560         if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
3561                 return;
3562
3563         cpumask_set_cpu(cpu, cpus_hardware_enabled);
3564
3565         r = kvm_arch_hardware_enable();
3566
3567         if (r) {
3568                 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
3569                 atomic_inc(&hardware_enable_failed);
3570                 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
3571         }
3572 }
3573
3574 static int kvm_starting_cpu(unsigned int cpu)
3575 {
3576         raw_spin_lock(&kvm_count_lock);
3577         if (kvm_usage_count)
3578                 hardware_enable_nolock(NULL);
3579         raw_spin_unlock(&kvm_count_lock);
3580         return 0;
3581 }
3582
3583 static void hardware_disable_nolock(void *junk)
3584 {
3585         int cpu = raw_smp_processor_id();
3586
3587         if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
3588                 return;
3589         cpumask_clear_cpu(cpu, cpus_hardware_enabled);
3590         kvm_arch_hardware_disable();
3591 }
3592
3593 static int kvm_dying_cpu(unsigned int cpu)
3594 {
3595         raw_spin_lock(&kvm_count_lock);
3596         if (kvm_usage_count)
3597                 hardware_disable_nolock(NULL);
3598         raw_spin_unlock(&kvm_count_lock);
3599         return 0;
3600 }
3601
3602 static void hardware_disable_all_nolock(void)
3603 {
3604         BUG_ON(!kvm_usage_count);
3605
3606         kvm_usage_count--;
3607         if (!kvm_usage_count)
3608                 on_each_cpu(hardware_disable_nolock, NULL, 1);
3609 }
3610
3611 static void hardware_disable_all(void)
3612 {
3613         raw_spin_lock(&kvm_count_lock);
3614         hardware_disable_all_nolock();
3615         raw_spin_unlock(&kvm_count_lock);
3616 }
3617
3618 static int hardware_enable_all(void)
3619 {
3620         int r = 0;
3621
3622         raw_spin_lock(&kvm_count_lock);
3623
3624         kvm_usage_count++;
3625         if (kvm_usage_count == 1) {
3626                 atomic_set(&hardware_enable_failed, 0);
3627                 on_each_cpu(hardware_enable_nolock, NULL, 1);
3628
3629                 if (atomic_read(&hardware_enable_failed)) {
3630                         hardware_disable_all_nolock();
3631                         r = -EBUSY;
3632                 }
3633         }
3634
3635         raw_spin_unlock(&kvm_count_lock);
3636
3637         return r;
3638 }
3639
3640 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3641                       void *v)
3642 {
3643         /*
3644          * Some (well, at least mine) BIOSes hang on reboot if
3645          * in vmx root mode.
3646          *
3647          * And Intel TXT requires VMX to be off on all CPUs when the system shuts down.
3648          */
3649         pr_info("kvm: exiting hardware virtualization\n");
3650         kvm_rebooting = true;
3651         on_each_cpu(hardware_disable_nolock, NULL, 1);
3652         return NOTIFY_OK;
3653 }
3654
3655 static struct notifier_block kvm_reboot_notifier = {
3656         .notifier_call = kvm_reboot,
3657         .priority = 0,
3658 };
3659
3660 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3661 {
3662         int i;
3663
3664         for (i = 0; i < bus->dev_count; i++) {
3665                 struct kvm_io_device *pos = bus->range[i].dev;
3666
3667                 kvm_iodevice_destructor(pos);
3668         }
3669         kfree(bus);
3670 }
3671
3672 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
3673                                  const struct kvm_io_range *r2)
3674 {
3675         gpa_t addr1 = r1->addr;
3676         gpa_t addr2 = r2->addr;
3677
3678         if (addr1 < addr2)
3679                 return -1;
3680
3681         /* If r2->len == 0, match the exact address.  If r2->len != 0,
3682          * accept a write that lies entirely within the registered range.
3683          * Any order is acceptable for overlapping ranges, because
3684          * kvm_io_bus_get_first_dev ensures we process all of them.
3685          */
3686         if (r2->len) {
3687                 addr1 += r1->len;
3688                 addr2 += r2->len;
3689         }
3690
3691         if (addr1 > addr2)
3692                 return 1;
3693
3694         return 0;
3695 }
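
/*
 * Worked example of the comparison above, with r1 the access and r2 a
 * registered range (as in the lookup paths below): against
 * r2 = {.addr = 0x100, .len = 4}, the access {.addr = 0x102, .len = 2}
 * compares equal because it lies entirely inside the range, while
 * {.addr = 0x102, .len = 4} compares greater because it runs past the end.
 * A zero-length registration r2 = {.addr = 0x100, .len = 0} matches only
 * accesses that start at exactly 0x100, whatever their length.
 */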
3696
3697 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
3698 {
3699         return kvm_io_bus_cmp(p1, p2);
3700 }
3701
3702 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
3703                              gpa_t addr, int len)
3704 {
3705         struct kvm_io_range *range, key;
3706         int off;
3707
3708         key = (struct kvm_io_range) {
3709                 .addr = addr,
3710                 .len = len,
3711         };
3712
3713         range = bsearch(&key, bus->range, bus->dev_count,
3714                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
3715         if (range == NULL)
3716                 return -ENOENT;
3717
3718         off = range - bus->range;
3719
3720         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
3721                 off--;
3722
3723         return off;
3724 }
3725
3726 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
3727                               struct kvm_io_range *range, const void *val)
3728 {
3729         int idx;
3730
3731         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
3732         if (idx < 0)
3733                 return -EOPNOTSUPP;
3734
3735         while (idx < bus->dev_count &&
3736                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
3737                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
3738                                         range->len, val))
3739                         return idx;
3740                 idx++;
3741         }
3742
3743         return -EOPNOTSUPP;
3744 }
3745
3746 /* kvm_io_bus_write - called under kvm->slots_lock */
3747 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
3748                      int len, const void *val)
3749 {
3750         struct kvm_io_bus *bus;
3751         struct kvm_io_range range;
3752         int r;
3753
3754         range = (struct kvm_io_range) {
3755                 .addr = addr,
3756                 .len = len,
3757         };
3758
3759         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
3760         if (!bus)
3761                 return -ENOMEM;
3762         r = __kvm_io_bus_write(vcpu, bus, &range, val);
3763         return r < 0 ? r : 0;
3764 }
3765
3766 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
3767 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
3768                             gpa_t addr, int len, const void *val, long cookie)
3769 {
3770         struct kvm_io_bus *bus;
3771         struct kvm_io_range range;
3772
3773         range = (struct kvm_io_range) {
3774                 .addr = addr,
3775                 .len = len,
3776         };
3777
3778         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
3779         if (!bus)
3780                 return -ENOMEM;
3781
3782         /* First try the device referenced by cookie. */
3783         if ((cookie >= 0) && (cookie < bus->dev_count) &&
3784             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
3785                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
3786                                         val))
3787                         return cookie;
3788
3789         /*
3790          * cookie contained garbage; fall back to search and return the
3791          * correct cookie value.
3792          */
3793         return __kvm_io_bus_write(vcpu, bus, &range, val);
3794 }
3795
3796 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
3797                              struct kvm_io_range *range, void *val)
3798 {
3799         int idx;
3800
3801         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
3802         if (idx < 0)
3803                 return -EOPNOTSUPP;
3804
3805         while (idx < bus->dev_count &&
3806                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
3807                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
3808                                        range->len, val))
3809                         return idx;
3810                 idx++;
3811         }
3812
3813         return -EOPNOTSUPP;
3814 }
3815 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
3816
3817 /* kvm_io_bus_read - called under kvm->slots_lock */
3818 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
3819                     int len, void *val)
3820 {
3821         struct kvm_io_bus *bus;
3822         struct kvm_io_range range;
3823         int r;
3824
3825         range = (struct kvm_io_range) {
3826                 .addr = addr,
3827                 .len = len,
3828         };
3829
3830         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
3831         if (!bus)
3832                 return -ENOMEM;
3833         r = __kvm_io_bus_read(vcpu, bus, &range, val);
3834         return r < 0 ? r : 0;
3835 }
3836
3837
3838 /* Caller must hold slots_lock. */
3839 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
3840                             int len, struct kvm_io_device *dev)
3841 {
3842         int i;
3843         struct kvm_io_bus *new_bus, *bus;
3844         struct kvm_io_range range;
3845
3846         bus = kvm_get_bus(kvm, bus_idx);
3847         if (!bus)
3848                 return -ENOMEM;
3849
3850         /* exclude ioeventfds; their count is already bounded by the fd limit */
3851         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
3852                 return -ENOSPC;
3853
3854         new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
3855                           sizeof(struct kvm_io_range)), GFP_KERNEL);
3856         if (!new_bus)
3857                 return -ENOMEM;
3858
3859         range = (struct kvm_io_range) {
3860                 .addr = addr,
3861                 .len = len,
3862                 .dev = dev,
3863         };
3864
3865         for (i = 0; i < bus->dev_count; i++)
3866                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
3867                         break;
3868
3869         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
3870         new_bus->dev_count++;
3871         new_bus->range[i] = range;
3872         memcpy(new_bus->range + i + 1, bus->range + i,
3873                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
3874         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3875         synchronize_srcu_expedited(&kvm->srcu);
3876         kfree(bus);
3877
3878         return 0;
3879 }
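
/*
 * Illustration of the insertion above: adding {.addr = 0x200} to a bus that
 * already holds {.addr = 0x100} and {.addr = 0x300} copies the header and the
 * first entry, places the new range at index 1, shifts {.addr = 0x300} up by
 * one, and only then publishes the enlarged array with rcu_assign_pointer();
 * synchronize_srcu_expedited() guarantees no reader still sees the old array
 * before it is freed.
 */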
3880
3881 /* Caller must hold slots_lock. */
3882 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3883                                struct kvm_io_device *dev)
3884 {
3885         int i, j;
3886         struct kvm_io_bus *new_bus, *bus;
3887
3888         bus = kvm_get_bus(kvm, bus_idx);
3889         if (!bus)
3890                 return;
3891
3892         for (i = 0; i < bus->dev_count; i++)
3893                 if (bus->range[i].dev == dev) {
3894                         break;
3895                 }
3896
3897         if (i == bus->dev_count)
3898                 return;
3899
3900         new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
3901                           sizeof(struct kvm_io_range)), GFP_KERNEL);
3902         if (new_bus) {
3903                 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
3904                 new_bus->dev_count--;
3905                 memcpy(new_bus->range + i, bus->range + i + 1,
3906                        (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
3907         } else {
3908                 pr_err("kvm: failed to shrink bus, removing it completely\n");
3909                 for (j = 0; j < bus->dev_count; j++) {
3910                         if (j == i)
3911                                 continue;
3912                         kvm_iodevice_destructor(bus->range[j].dev);
3913                 }
3914         }
3915
3916         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
3917         synchronize_srcu_expedited(&kvm->srcu);
3918         kfree(bus);
3919         return;
3920 }
3921
3922 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
3923                                          gpa_t addr)
3924 {
3925         struct kvm_io_bus *bus;
3926         int dev_idx, srcu_idx;
3927         struct kvm_io_device *iodev = NULL;
3928
3929         srcu_idx = srcu_read_lock(&kvm->srcu);
3930
3931         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
3932         if (!bus)
3933                 goto out_unlock;
3934
3935         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
3936         if (dev_idx < 0)
3937                 goto out_unlock;
3938
3939         iodev = bus->range[dev_idx].dev;
3940
3941 out_unlock:
3942         srcu_read_unlock(&kvm->srcu, srcu_idx);
3943
3944         return iodev;
3945 }
3946 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
3947
3948 static int kvm_debugfs_open(struct inode *inode, struct file *file,
3949                            int (*get)(void *, u64 *), int (*set)(void *, u64),
3950                            const char *fmt)
3951 {
3952         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
3953                                           inode->i_private;
3954
3955         /* The debugfs files reference the kvm struct, which is still
3956          * valid when kvm_destroy_vm is called.  To avoid the race between
3957          * open and removal of the debugfs directory, test the users count
3958          * and fail the open if it has already dropped to zero.
3959          */
3960         if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
3961                 return -ENOENT;
3962
3963         if (simple_attr_open(inode, file, get,
3964                              stat_data->mode & S_IWUGO ? set : NULL,
3965                              fmt)) {
3966                 kvm_put_kvm(stat_data->kvm);
3967                 return -ENOMEM;
3968         }
3969
3970         return 0;
3971 }
3972
3973 static int kvm_debugfs_release(struct inode *inode, struct file *file)
3974 {
3975         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
3976                                           inode->i_private;
3977
3978         simple_attr_release(inode, file);
3979         kvm_put_kvm(stat_data->kvm);
3980
3981         return 0;
3982 }
3983
3984 static int vm_stat_get_per_vm(void *data, u64 *val)
3985 {
3986         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3987
3988         *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
3989
3990         return 0;
3991 }
3992
3993 static int vm_stat_clear_per_vm(void *data, u64 val)
3994 {
3995         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
3996
3997         if (val)
3998                 return -EINVAL;
3999
4000         *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
4001
4002         return 0;
4003 }
4004
4005 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
4006 {
4007         __simple_attr_check_format("%llu\n", 0ull);
4008         return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
4009                                 vm_stat_clear_per_vm, "%llu\n");
4010 }
4011
4012 static const struct file_operations vm_stat_get_per_vm_fops = {
4013         .owner   = THIS_MODULE,
4014         .open    = vm_stat_get_per_vm_open,
4015         .release = kvm_debugfs_release,
4016         .read    = simple_attr_read,
4017         .write   = simple_attr_write,
4018         .llseek  = no_llseek,
4019 };
4020
4021 static int vcpu_stat_get_per_vm(void *data, u64 *val)
4022 {
4023         int i;
4024         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4025         struct kvm_vcpu *vcpu;
4026
4027         *val = 0;
4028
4029         kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4030                 *val += *(u64 *)((void *)vcpu + stat_data->offset);
4031
4032         return 0;
4033 }
4034
4035 static int vcpu_stat_clear_per_vm(void *data, u64 val)
4036 {
4037         int i;
4038         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
4039         struct kvm_vcpu *vcpu;
4040
4041         if (val)
4042                 return -EINVAL;
4043
4044         kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
4045                 *(u64 *)((void *)vcpu + stat_data->offset) = 0;
4046
4047         return 0;
4048 }
4049
4050 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
4051 {
4052         __simple_attr_check_format("%llu\n", 0ull);
4053         return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
4054                                  vcpu_stat_clear_per_vm, "%llu\n");
4055 }
4056
4057 static const struct file_operations vcpu_stat_get_per_vm_fops = {
4058         .owner   = THIS_MODULE,
4059         .open    = vcpu_stat_get_per_vm_open,
4060         .release = kvm_debugfs_release,
4061         .read    = simple_attr_read,
4062         .write   = simple_attr_write,
4063         .llseek  = no_llseek,
4064 };
4065
4066 static const struct file_operations *stat_fops_per_vm[] = {
4067         [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
4068         [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
4069 };
4070
4071 static int vm_stat_get(void *_offset, u64 *val)
4072 {
4073         unsigned offset = (long)_offset;
4074         struct kvm *kvm;
4075         struct kvm_stat_data stat_tmp = {.offset = offset};
4076         u64 tmp_val;
4077
4078         *val = 0;
4079         mutex_lock(&kvm_lock);
4080         list_for_each_entry(kvm, &vm_list, vm_list) {
4081                 stat_tmp.kvm = kvm;
4082                 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4083                 *val += tmp_val;
4084         }
4085         mutex_unlock(&kvm_lock);
4086         return 0;
4087 }
4088
4089 static int vm_stat_clear(void *_offset, u64 val)
4090 {
4091         unsigned offset = (long)_offset;
4092         struct kvm *kvm;
4093         struct kvm_stat_data stat_tmp = {.offset = offset};
4094
4095         if (val)
4096                 return -EINVAL;
4097
4098         mutex_lock(&kvm_lock);
4099         list_for_each_entry(kvm, &vm_list, vm_list) {
4100                 stat_tmp.kvm = kvm;
4101                 vm_stat_clear_per_vm((void *)&stat_tmp, 0);
4102         }
4103         mutex_unlock(&kvm_lock);
4104
4105         return 0;
4106 }
4107
4108 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
4109
4110 static int vcpu_stat_get(void *_offset, u64 *val)
4111 {
4112         unsigned offset = (long)_offset;
4113         struct kvm *kvm;
4114         struct kvm_stat_data stat_tmp = {.offset = offset};
4115         u64 tmp_val;
4116
4117         *val = 0;
4118         mutex_lock(&kvm_lock);
4119         list_for_each_entry(kvm, &vm_list, vm_list) {
4120                 stat_tmp.kvm = kvm;
4121                 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
4122                 *val += tmp_val;
4123         }
4124         mutex_unlock(&kvm_lock);
4125         return 0;
4126 }
4127
4128 static int vcpu_stat_clear(void *_offset, u64 val)
4129 {
4130         unsigned offset = (long)_offset;
4131         struct kvm *kvm;
4132         struct kvm_stat_data stat_tmp = {.offset = offset};
4133
4134         if (val)
4135                 return -EINVAL;
4136
4137         mutex_lock(&kvm_lock);
4138         list_for_each_entry(kvm, &vm_list, vm_list) {
4139                 stat_tmp.kvm = kvm;
4140                 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
4141         }
4142         mutex_unlock(&kvm_lock);
4143
4144         return 0;
4145 }
4146
4147 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
4148                         "%llu\n");
4149
4150 static const struct file_operations *stat_fops[] = {
4151         [KVM_STAT_VCPU] = &vcpu_stat_fops,
4152         [KVM_STAT_VM]   = &vm_stat_fops,
4153 };
4154
4155 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
4156 {
4157         struct kobj_uevent_env *env;
4158         unsigned long long created, active;
4159
4160         if (!kvm_dev.this_device || !kvm)
4161                 return;
4162
4163         mutex_lock(&kvm_lock);
4164         if (type == KVM_EVENT_CREATE_VM) {
4165                 kvm_createvm_count++;
4166                 kvm_active_vms++;
4167         } else if (type == KVM_EVENT_DESTROY_VM) {
4168                 kvm_active_vms--;
4169         }
4170         created = kvm_createvm_count;
4171         active = kvm_active_vms;
4172         mutex_unlock(&kvm_lock);
4173
4174         env = kzalloc(sizeof(*env), GFP_KERNEL);
4175         if (!env)
4176                 return;
4177
4178         add_uevent_var(env, "CREATED=%llu", created);
4179         add_uevent_var(env, "COUNT=%llu", active);
4180
4181         if (type == KVM_EVENT_CREATE_VM) {
4182                 add_uevent_var(env, "EVENT=create");
4183                 kvm->userspace_pid = task_pid_nr(current);
4184         } else if (type == KVM_EVENT_DESTROY_VM) {
4185                 add_uevent_var(env, "EVENT=destroy");
4186         }
4187         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
4188
4189         if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
4190                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);
4191
4192                 if (p) {
4193                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
4194                         if (!IS_ERR(tmp))
4195                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
4196                         kfree(p);
4197                 }
4198         }
4199         /* no need for checks, since we are adding at most 5 keys */
4200         env->envp[env->envp_idx++] = NULL;
4201         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
4202         kfree(env);
4203 }
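
/*
 * A sketch (hypothetical rule, shown only to illustrate the uevent payload) of
 * consuming the KOBJ_CHANGE event above from userspace via udev:
 *
 *	SUBSYSTEM=="misc", KERNEL=="kvm", ACTION=="change", ENV{EVENT}=="create", \
 *		RUN+="/usr/bin/logger kvm VM created by PID $env{PID}"
 */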
4204
4205 static void kvm_init_debug(void)
4206 {
4207         struct kvm_stats_debugfs_item *p;
4208
4209         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
4210
4211         kvm_debugfs_num_entries = 0;
4212         for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
4213                 int mode = p->mode ? p->mode : 0644;
4214                 debugfs_create_file(p->name, mode, kvm_debugfs_dir,
4215                                     (void *)(long)p->offset,
4216                                     stat_fops[p->kind]);
4217         }
4218 }
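
/*
 * With debugfs mounted in the usual place, this yields one file per
 * debugfs_entries[] item (e.g. /sys/kernel/debug/kvm/exits on x86), while
 * kvm_create_vm_debugfs() adds per-VM copies of the same statistics under
 * /sys/kernel/debug/kvm/<pid>-<fd>/ using stat_fops_per_vm[] above.
 */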
4219
4220 static int kvm_suspend(void)
4221 {
4222         if (kvm_usage_count)
4223                 hardware_disable_nolock(NULL);
4224         return 0;
4225 }
4226
4227 static void kvm_resume(void)
4228 {
4229         if (kvm_usage_count) {
4230                 WARN_ON(raw_spin_is_locked(&kvm_count_lock));
4231                 hardware_enable_nolock(NULL);
4232         }
4233 }
4234
4235 static struct syscore_ops kvm_syscore_ops = {
4236         .suspend = kvm_suspend,
4237         .resume = kvm_resume,
4238 };
4239
4240 static inline
4241 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
4242 {
4243         return container_of(pn, struct kvm_vcpu, preempt_notifier);
4244 }
4245
4246 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
4247 {
4248         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4249
4250         if (vcpu->preempted)
4251                 vcpu->preempted = false;
4252
4253         kvm_arch_sched_in(vcpu, cpu);
4254
4255         kvm_arch_vcpu_load(vcpu, cpu);
4256 }
4257
4258 static void kvm_sched_out(struct preempt_notifier *pn,
4259                           struct task_struct *next)
4260 {
4261         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
4262
4263         if (current->state == TASK_RUNNING)
4264                 vcpu->preempted = true;
4265         kvm_arch_vcpu_put(vcpu);
4266 }
4267
4268 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
4269                   struct module *module)
4270 {
4271         int r;
4272         int cpu;
4273
4274         r = kvm_arch_init(opaque);
4275         if (r)
4276                 goto out_fail;
4277
4278         /*
4279          * kvm_arch_init makes sure there's at most one caller
4280          * for architectures that support multiple implementations,
4281          * like Intel and AMD on x86.
4282          * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
4283          * conflicts in case KVM is already set up for another implementation.
4284          */
4285         r = kvm_irqfd_init();
4286         if (r)
4287                 goto out_irqfd;
4288
4289         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
4290                 r = -ENOMEM;
4291                 goto out_free_0;
4292         }
4293
4294         r = kvm_arch_hardware_setup();
4295         if (r < 0)
4296                 goto out_free_0a;
4297
4298         for_each_online_cpu(cpu) {
4299                 smp_call_function_single(cpu,
4300                                 kvm_arch_check_processor_compat,
4301                                 &r, 1);
4302                 if (r < 0)
4303                         goto out_free_1;
4304         }
4305
4306         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
4307                                       kvm_starting_cpu, kvm_dying_cpu);
4308         if (r)
4309                 goto out_free_2;
4310         register_reboot_notifier(&kvm_reboot_notifier);
4311
4312         /* A kmem cache lets us meet the alignment requirements of fx_save. */
4313         if (!vcpu_align)
4314                 vcpu_align = __alignof__(struct kvm_vcpu);
4315         kvm_vcpu_cache =
4316                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
4317                                            SLAB_ACCOUNT,
4318                                            offsetof(struct kvm_vcpu, arch),
4319                                            sizeof_field(struct kvm_vcpu, arch),
4320                                            NULL);
4321         if (!kvm_vcpu_cache) {
4322                 r = -ENOMEM;
4323                 goto out_free_3;
4324         }
4325
4326         r = kvm_async_pf_init();
4327         if (r)
4328                 goto out_free;
4329
4330         kvm_chardev_ops.owner = module;
4331         kvm_vm_fops.owner = module;
4332         kvm_vcpu_fops.owner = module;
4333
4334         r = misc_register(&kvm_dev);
4335         if (r) {
4336                 pr_err("kvm: misc device register failed\n");
4337                 goto out_unreg;
4338         }
4339
4340         register_syscore_ops(&kvm_syscore_ops);
4341
4342         kvm_preempt_ops.sched_in = kvm_sched_in;
4343         kvm_preempt_ops.sched_out = kvm_sched_out;
4344
4345         kvm_init_debug();
4346
4347         r = kvm_vfio_ops_init();
4348         WARN_ON(r);
4349
4350         return 0;
4351
4352 out_unreg:
4353         kvm_async_pf_deinit();
4354 out_free:
4355         kmem_cache_destroy(kvm_vcpu_cache);
4356 out_free_3:
4357         unregister_reboot_notifier(&kvm_reboot_notifier);
4358         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
4359 out_free_2:
4360 out_free_1:
4361         kvm_arch_hardware_unsetup();
4362 out_free_0a:
4363         free_cpumask_var(cpus_hardware_enabled);
4364 out_free_0:
4365         kvm_irqfd_exit();
4366 out_irqfd:
4367         kvm_arch_exit();
4368 out_fail:
4369         return r;
4370 }
4371 EXPORT_SYMBOL_GPL(kvm_init);
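
/*
 * Architecture modules call this from their module_init hook; on x86, for
 * example, the VMX module does roughly
 *
 *	kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *		 __alignof__(struct vcpu_vmx), THIS_MODULE);
 *
 * and pairs it with kvm_exit() on unload.
 */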
4372
4373 void kvm_exit(void)
4374 {
4375         debugfs_remove_recursive(kvm_debugfs_dir);
4376         misc_deregister(&kvm_dev);
4377         kmem_cache_destroy(kvm_vcpu_cache);
4378         kvm_async_pf_deinit();
4379         unregister_syscore_ops(&kvm_syscore_ops);
4380         unregister_reboot_notifier(&kvm_reboot_notifier);
4381         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
4382         on_each_cpu(hardware_disable_nolock, NULL, 1);
4383         kvm_arch_hardware_unsetup();
4384         kvm_arch_exit();
4385         kvm_irqfd_exit();
4386         free_cpumask_var(cpus_hardware_enabled);
4387         kvm_vfio_ops_exit();
4388 }
4389 EXPORT_SYMBOL_GPL(kvm_exit);
4390
4391 struct kvm_vm_worker_thread_context {
4392         struct kvm *kvm;
4393         struct task_struct *parent;
4394         struct completion init_done;
4395         kvm_vm_thread_fn_t thread_fn;
4396         uintptr_t data;
4397         int err;
4398 };
4399
4400 static int kvm_vm_worker_thread(void *context)
4401 {
4402         /*
4403          * The init_context is allocated on the parent thread's stack, so
4404          * anything needed beyond initialization must be copied locally.
4405          */
4406         struct kvm_vm_worker_thread_context *init_context = context;
4407         struct kvm *kvm = init_context->kvm;
4408         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
4409         uintptr_t data = init_context->data;
4410         int err;
4411
4412         err = kthread_park(current);
4413         /* kthread_park(current) is never supposed to return an error */
4414         WARN_ON(err != 0);
4415         if (err)
4416                 goto init_complete;
4417
4418         err = cgroup_attach_task_all(init_context->parent, current);
4419         if (err) {
4420                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
4421                         __func__, err);
4422                 goto init_complete;
4423         }
4424
4425         set_user_nice(current, task_nice(init_context->parent));
4426
4427 init_complete:
4428         init_context->err = err;
4429         complete(&init_context->init_done);
4430         init_context = NULL;
4431
4432         if (err)
4433                 return err;
4434
4435         /* Wait to be woken up by the spawner before proceeding. */
4436         kthread_parkme();
4437
4438         if (!kthread_should_stop())
4439                 err = thread_fn(kvm, data);
4440
4441         return err;
4442 }
4443
4444 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
4445                                 uintptr_t data, const char *name,
4446                                 struct task_struct **thread_ptr)
4447 {
4448         struct kvm_vm_worker_thread_context init_context = {};
4449         struct task_struct *thread;
4450
4451         *thread_ptr = NULL;
4452         init_context.kvm = kvm;
4453         init_context.parent = current;
4454         init_context.thread_fn = thread_fn;
4455         init_context.data = data;
4456         init_completion(&init_context.init_done);
4457
4458         thread = kthread_run(kvm_vm_worker_thread, &init_context,
4459                              "%s-%d", name, task_pid_nr(current));
4460         if (IS_ERR(thread))
4461                 return PTR_ERR(thread);
4462
4463         /* kthread_run is never supposed to return NULL */
4464         WARN_ON(thread == NULL);
4465
4466         wait_for_completion(&init_context.init_done);
4467
4468         if (!init_context.err)
4469                 *thread_ptr = thread;
4470
4471         return init_context.err;
4472 }
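
/*
 * A sketch of a hypothetical caller (names assumed for illustration): the
 * worker is created parked, so the owner unparks it to run thread_fn and
 * stops it on VM teardown.
 *
 *	struct task_struct *worker;
 *
 *	if (!kvm_vm_create_worker_thread(kvm, my_vm_work_fn, 0,
 *					 "kvm-worker", &worker))
 *		kthread_unpark(worker);
 *	...
 *	kthread_stop(worker);
 */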