GNU Linux-libre 5.15.54-gnu
[releases.git] / virt / kvm / kvm_main.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * Copyright (C) 2006 Qumranet, Inc.
9  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10  *
11  * Authors:
12  *   Avi Kivity   <avi@qumranet.com>
13  *   Yaniv Kamay  <yaniv@qumranet.com>
14  */
15
16 #include <kvm/iodev.h>
17
18 #include <linux/kvm_host.h>
19 #include <linux/kvm.h>
20 #include <linux/module.h>
21 #include <linux/errno.h>
22 #include <linux/percpu.h>
23 #include <linux/mm.h>
24 #include <linux/miscdevice.h>
25 #include <linux/vmalloc.h>
26 #include <linux/reboot.h>
27 #include <linux/debugfs.h>
28 #include <linux/highmem.h>
29 #include <linux/file.h>
30 #include <linux/syscore_ops.h>
31 #include <linux/cpu.h>
32 #include <linux/sched/signal.h>
33 #include <linux/sched/mm.h>
34 #include <linux/sched/stat.h>
35 #include <linux/cpumask.h>
36 #include <linux/smp.h>
37 #include <linux/anon_inodes.h>
38 #include <linux/profile.h>
39 #include <linux/kvm_para.h>
40 #include <linux/pagemap.h>
41 #include <linux/mman.h>
42 #include <linux/swap.h>
43 #include <linux/bitops.h>
44 #include <linux/spinlock.h>
45 #include <linux/compat.h>
46 #include <linux/srcu.h>
47 #include <linux/hugetlb.h>
48 #include <linux/slab.h>
49 #include <linux/sort.h>
50 #include <linux/bsearch.h>
51 #include <linux/io.h>
52 #include <linux/lockdep.h>
53 #include <linux/kthread.h>
54 #include <linux/suspend.h>
55
56 #include <asm/processor.h>
57 #include <asm/ioctl.h>
58 #include <linux/uaccess.h>
59
60 #include "coalesced_mmio.h"
61 #include "async_pf.h"
62 #include "mmu_lock.h"
63 #include "vfio.h"
64
65 #define CREATE_TRACE_POINTS
66 #include <trace/events/kvm.h>
67
68 #include <linux/kvm_dirty_ring.h>
69
/*
 * Worst case buffer size needed for holding an integer.
 * NOTE(review): 12 presumably covers 10 decimal digits of a 32-bit value
 * plus a sign and the NUL terminator -- confirm against the users.
 */
#define ITOA_MAX_LEN 12
72
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Halt-polling tunables.  Each one is a module parameter with mode 0644
 * and is exported (GPL-only) for use outside this file.
 */

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default (zero, as no initializer is given) resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
95
/*
 * Ordering of locks:
 *
 *      kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

/* NOTE(review): kvm_lock presumably protects vm_list -- confirm at the users. */
DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

/* Slab cache that vCPU structures are returned to by kvm_vcpu_destroy(). */
static struct kmem_cache *kvm_vcpu_cache;

/* Preempt notifier ops attached to every vCPU by kvm_vcpu_init(). */
static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer set by vcpu_load() and cleared by vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static const struct file_operations stat_fops_per_vm;

static struct file_operations kvm_chardev_ops;
121
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
/*
 * KVM_COMPAT(c) expands to file_operations initializer fields.  With
 * CONFIG_KVM_COMPAT it installs @c as .compat_ioctl; without it, @c is
 * ignored and the rejecting handlers below are installed instead.
 */
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg);
#define KVM_COMPAT(c)   .compat_ioctl   = (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
/* Second line of defense: every compat ioctl fails with -EINVAL. */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                unsigned long arg) { return -EINVAL; }

/* First line of defense: a compat task gets -ENODEV on open, others succeed. */
static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
        return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)   .compat_ioctl   = kvm_no_compat_ioctl,  \
                        .open           = kvm_no_compat_open
#endif
/* Forward declarations; the definitions appear later in this file. */
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

/*
 * NOTE(review): presumably the values passed as @type to
 * kvm_uevent_notify_change() -- confirm at the call sites.
 */
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
159
/*
 * Default no-op implementation of the arch hook that
 * kvm_mmu_notifier_invalidate_range() calls with the invalidated
 * [start, end) range.  Declared __weak so an architecture can provide its
 * own definition.
 */
__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
                                                   unsigned long start, unsigned long end)
{
}
164
165 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
166 {
167         /*
168          * The metadata used by is_zone_device_page() to determine whether or
169          * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
170          * the device has been pinned, e.g. by get_user_pages().  WARN if the
171          * page_count() is zero to help detect bad usage of this helper.
172          */
173         if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
174                 return false;
175
176         return is_zone_device_page(pfn_to_page(pfn));
177 }
178
179 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
180 {
181         /*
182          * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
183          * perspective they are "normal" pages, albeit with slightly different
184          * usage rules.
185          */
186         if (pfn_valid(pfn))
187                 return PageReserved(pfn_to_page(pfn)) &&
188                        !is_zero_pfn(pfn) &&
189                        !kvm_is_zone_device_pfn(pfn);
190
191         return true;
192 }
193
/*
 * Switches to specified vcpu, until a matching vcpu_put()
 *
 * Between get_cpu() and put_cpu() this records @vcpu in the current CPU's
 * kvm_running_vcpu slot, registers the vCPU's preempt notifier and then
 * calls kvm_arch_vcpu_load() with the current CPU number.
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
        int cpu = get_cpu();

        __this_cpu_write(kvm_running_vcpu, vcpu);
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);
207
/*
 * Counterpart of vcpu_load().  With preemption disabled, undoes the load
 * steps in reverse order: calls kvm_arch_vcpu_put(), unregisters the
 * vCPU's preempt notifier and clears this CPU's kvm_running_vcpu slot.
 */
void vcpu_put(struct kvm_vcpu *vcpu)
{
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
        __this_cpu_write(kvm_running_vcpu, NULL);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);
217
218 /* TODO: merge with kvm_arch_vcpu_should_kick */
219 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
220 {
221         int mode = kvm_vcpu_exiting_guest_mode(vcpu);
222
223         /*
224          * We need to wait for the VCPU to reenable interrupts and get out of
225          * READING_SHADOW_PAGE_TABLES mode.
226          */
227         if (req & KVM_REQUEST_WAIT)
228                 return mode != OUTSIDE_GUEST_MODE;
229
230         /*
231          * Need to kick a running VCPU, but otherwise there is nothing to do.
232          */
233         return mode == IN_GUEST_MODE;
234 }
235
/*
 * Intentionally empty IPI callback handed to smp_call_function_many() by
 * kvm_kick_many_cpus(); no work is done in the handler itself.
 */
static void ack_flush(void *_completed)
{
}
239
240 static inline bool kvm_kick_many_cpus(cpumask_var_t tmp, bool wait)
241 {
242         const struct cpumask *cpus;
243
244         if (likely(cpumask_available(tmp)))
245                 cpus = tmp;
246         else
247                 cpus = cpu_online_mask;
248
249         if (cpumask_empty(cpus))
250                 return false;
251
252         smp_call_function_many(cpus, ack_flush, NULL, wait);
253         return true;
254 }
255
256 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
257                                  struct kvm_vcpu *except,
258                                  unsigned long *vcpu_bitmap, cpumask_var_t tmp)
259 {
260         int i, cpu, me;
261         struct kvm_vcpu *vcpu;
262         bool called;
263
264         me = get_cpu();
265
266         kvm_for_each_vcpu(i, vcpu, kvm) {
267                 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
268                     vcpu == except)
269                         continue;
270
271                 kvm_make_request(req, vcpu);
272
273                 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
274                         continue;
275
276                 /*
277                  * tmp can be "unavailable" if cpumasks are allocated off stack
278                  * as allocation of the mask is deliberately not fatal and is
279                  * handled by falling back to kicking all online CPUs.
280                  */
281                 if (!cpumask_available(tmp))
282                         continue;
283
284                 /*
285                  * Note, the vCPU could get migrated to a different pCPU at any
286                  * point after kvm_request_needs_ipi(), which could result in
287                  * sending an IPI to the previous pCPU.  But, that's ok because
288                  * the purpose of the IPI is to ensure the vCPU returns to
289                  * OUTSIDE_GUEST_MODE, which is satisfied if the vCPU migrates.
290                  * Entering READING_SHADOW_PAGE_TABLES after this point is also
291                  * ok, as the requirement is only that KVM wait for vCPUs that
292                  * were reading SPTEs _before_ any changes were finalized.  See
293                  * kvm_vcpu_kick() for more details on handling requests.
294                  */
295                 if (kvm_request_needs_ipi(vcpu, req)) {
296                         cpu = READ_ONCE(vcpu->cpu);
297                         if (cpu != -1 && cpu != me)
298                                 __cpumask_set_cpu(cpu, tmp);
299                 }
300         }
301
302         called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
303         put_cpu();
304
305         return called;
306 }
307
308 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
309                                       struct kvm_vcpu *except)
310 {
311         cpumask_var_t cpus;
312         bool called;
313
314         zalloc_cpumask_var(&cpus, GFP_ATOMIC);
315
316         called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);
317
318         free_cpumask_var(cpus);
319         return called;
320 }
321
/*
 * Post @req on every vCPU of @kvm, with no vCPU excluded.  Thin wrapper
 * around kvm_make_all_cpus_request_except(); returns its result.
 */
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
        return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
327
328 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
329 void kvm_flush_remote_tlbs(struct kvm *kvm)
330 {
331         ++kvm->stat.generic.remote_tlb_flush_requests;
332
333         /*
334          * We want to publish modifications to the page tables before reading
335          * mode. Pairs with a memory barrier in arch-specific code.
336          * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
337          * and smp_mb in walk_shadow_page_lockless_begin/end.
338          * - powerpc: smp_mb in kvmppc_prepare_to_enter.
339          *
340          * There is already an smp_mb__after_atomic() before
341          * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
342          * barrier here.
343          */
344         if (!kvm_arch_flush_remote_tlb(kvm)
345             || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
346                 ++kvm->stat.generic.remote_tlb_flush;
347 }
348 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
349 #endif
350
/* Post KVM_REQ_MMU_RELOAD on every vCPU of @kvm; the result is ignored. */
void kvm_reload_remote_mmus(struct kvm *kvm)
{
        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}
355
356 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
357 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
358                                                gfp_t gfp_flags)
359 {
360         gfp_flags |= mc->gfp_zero;
361
362         if (mc->kmem_cache)
363                 return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
364         else
365                 return (void *)__get_free_page(gfp_flags);
366 }
367
368 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
369 {
370         void *obj;
371
372         if (mc->nobjs >= min)
373                 return 0;
374         while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
375                 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
376                 if (!obj)
377                         return mc->nobjs >= min ? 0 : -ENOMEM;
378                 mc->objects[mc->nobjs++] = obj;
379         }
380         return 0;
381 }
382
/* Return the number of objects currently held in @mc (mc->nobjs). */
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
        return mc->nobjs;
}
387
388 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
389 {
390         while (mc->nobjs) {
391                 if (mc->kmem_cache)
392                         kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
393                 else
394                         free_page((unsigned long)mc->objects[--mc->nobjs]);
395         }
396 }
397
398 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
399 {
400         void *p;
401
402         if (WARN_ON(!mc->nobjs))
403                 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
404         else
405                 p = mc->objects[--mc->nobjs];
406         BUG_ON(!p);
407         return p;
408 }
409 #endif
410
/*
 * Put the architecture-independent fields of a new @vcpu into their
 * initial state and bind it to @kvm with vCPU id @id.
 */
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
        mutex_init(&vcpu->mutex);
        /* -1: not on any physical CPU (kvm_make_vcpus_request_mask() skips it). */
        vcpu->cpu = -1;
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
        rcuwait_init(&vcpu->wait);
        kvm_async_pf_vcpu_init(vcpu);

        vcpu->pre_pcpu = -1;
        INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
        vcpu->ready = false;
        /* Registered/unregistered later by vcpu_load()/vcpu_put(). */
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
        vcpu->last_used_slot = 0;
}
431
/*
 * Tear down @vcpu: run the arch destructor, free the dirty ring, drop the
 * reference on vcpu->pid, free the vcpu->run page and finally return the
 * vCPU structure to kvm_vcpu_cache.  @vcpu must not be used afterwards.
 */
void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_arch_vcpu_destroy(vcpu);
        kvm_dirty_ring_free(&vcpu->dirty_ring);

        /*
         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
         * the vcpu->pid pointer, and at destruction time all file descriptors
         * are already gone.
         */
        put_pid(rcu_dereference_protected(vcpu->pid, 1));

        free_page((unsigned long)vcpu->run);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
448
449 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
/* Map an mmu_notifier embedded in a struct kvm back to that struct kvm. */
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
        return container_of(mn, struct kvm, mmu_notifier);
}
454
455 static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
456                                               struct mm_struct *mm,
457                                               unsigned long start, unsigned long end)
458 {
459         struct kvm *kvm = mmu_notifier_to_kvm(mn);
460         int idx;
461
462         idx = srcu_read_lock(&kvm->srcu);
463         kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
464         srcu_read_unlock(&kvm->srcu, idx);
465 }
466
/*
 * Per-memslot callback invoked by __kvm_handle_hva_range() with the gfn
 * range that overlaps the hva range; the boolean results are OR'ed.
 */
typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

/*
 * Callback invoked once by __kvm_handle_hva_range(), right after the MMU
 * lock is first taken, with the original (unclamped) hva range.
 */
typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start,
                             unsigned long end);

/* Description of one hva-range walk for __kvm_handle_hva_range(). */
struct kvm_hva_range {
        unsigned long start;    /* first hva of the range */
        unsigned long end;      /* end of the range (exclusive) */
        pte_t pte;              /* copied into each kvm_gfn_range */
        hva_handler_t handler;  /* per-memslot handler, or kvm_null_fn */
        on_lock_fn_t on_lock;   /* run once under the MMU lock, or kvm_null_fn */
        bool flush_on_ret;      /* flush remote TLBs if a handler returned true */
        bool may_block;         /* copied into each kvm_gfn_range */
};
481
/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
/* True if @fn is the "no callback" stub above. */
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
494
/*
 * Walk every memslot in every address space of @kvm and, for each memslot
 * that overlaps the hva range described by @range, call range->handler
 * with the corresponding gfn range.
 *
 * The MMU lock is taken lazily, on the first overlapping memslot;
 * range->on_lock (if any) runs once at that point.  If no memslot overlaps,
 * neither the lock nor any callback is touched.  The whole walk runs inside
 * a kvm->srcu read-side critical section.
 *
 * Returns the OR of all handler results as an int (0 or 1).
 */
static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                                                  const struct kvm_hva_range *range)
{
        bool ret = false, locked = false;
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        int i, idx;

        /* A null handler is allowed if and only if on_lock() is provided. */
        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
                         IS_KVM_NULL_FN(range->handler)))
                return 0;

        idx = srcu_read_lock(&kvm->srcu);

        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                slots = __kvm_memslots(kvm, i);
                kvm_for_each_memslot(slot, slots) {
                        unsigned long hva_start, hva_end;

                        /* Clamp the requested range to this memslot. */
                        hva_start = max(range->start, slot->userspace_addr);
                        hva_end = min(range->end, slot->userspace_addr +
                                                  (slot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;

                        /*
                         * To optimize for the likely case where the address
                         * range is covered by zero or one memslots, don't
                         * bother making these conditional (to avoid writes on
                         * the second or later invocation of the handler).
                         */
                        gfn_range.pte = range->pte;
                        gfn_range.may_block = range->may_block;

                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
                        gfn_range.slot = slot;

                        if (!locked) {
                                locked = true;
                                KVM_MMU_LOCK(kvm);
                                if (!IS_KVM_NULL_FN(range->on_lock))
                                        range->on_lock(kvm, range->start, range->end);
                                /*
                                 * on_lock-only walk: this leaves the memslot
                                 * loop of the current address space only.
                                 */
                                if (IS_KVM_NULL_FN(range->handler))
                                        break;
                        }
                        ret |= range->handler(kvm, &gfn_range);
                }
        }

        /* The flush, when needed, is issued before the MMU lock is dropped. */
        if (range->flush_on_ret && ret)
                kvm_flush_remote_tlbs(kvm);

        if (locked)
                KVM_MMU_UNLOCK(kvm);

        srcu_read_unlock(&kvm->srcu, idx);

        /* The notifiers are averse to booleans. :-( */
        return (int)ret;
}
562
563 static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
564                                                 unsigned long start,
565                                                 unsigned long end,
566                                                 pte_t pte,
567                                                 hva_handler_t handler)
568 {
569         struct kvm *kvm = mmu_notifier_to_kvm(mn);
570         const struct kvm_hva_range range = {
571                 .start          = start,
572                 .end            = end,
573                 .pte            = pte,
574                 .handler        = handler,
575                 .on_lock        = (void *)kvm_null_fn,
576                 .flush_on_ret   = true,
577                 .may_block      = false,
578         };
579
580         return __kvm_handle_hva_range(kvm, &range);
581 }
582
583 static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
584                                                          unsigned long start,
585                                                          unsigned long end,
586                                                          hva_handler_t handler)
587 {
588         struct kvm *kvm = mmu_notifier_to_kvm(mn);
589         const struct kvm_hva_range range = {
590                 .start          = start,
591                 .end            = end,
592                 .pte            = __pte(0),
593                 .handler        = handler,
594                 .on_lock        = (void *)kvm_null_fn,
595                 .flush_on_ret   = false,
596                 .may_block      = false,
597         };
598
599         return __kvm_handle_hva_range(kvm, &range);
600 }
601 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
602                                         struct mm_struct *mm,
603                                         unsigned long address,
604                                         pte_t pte)
605 {
606         struct kvm *kvm = mmu_notifier_to_kvm(mn);
607
608         trace_kvm_set_spte_hva(address);
609
610         /*
611          * .change_pte() must be surrounded by .invalidate_range_{start,end}().
612          * If mmu_notifier_count is zero, then no in-progress invalidations,
613          * including this one, found a relevant memslot at start(); rechecking
614          * memslots here is unnecessary.  Note, a false positive (count elevated
615          * by a different invalidation) is sub-optimal but functionally ok.
616          */
617         WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
618         if (!READ_ONCE(kvm->mmu_notifier_count))
619                 return;
620
621         kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
622 }
623
624 void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start,
625                                    unsigned long end)
626 {
627         /*
628          * The count increase must become visible at unlock time as no
629          * spte can be established without taking the mmu_lock and
630          * count is also read inside the mmu_lock critical section.
631          */
632         kvm->mmu_notifier_count++;
633         if (likely(kvm->mmu_notifier_count == 1)) {
634                 kvm->mmu_notifier_range_start = start;
635                 kvm->mmu_notifier_range_end = end;
636         } else {
637                 /*
638                  * Fully tracking multiple concurrent ranges has dimishing
639                  * returns. Keep things simple and just find the minimal range
640                  * which includes the current and new ranges. As there won't be
641                  * enough information to subtract a range after its invalidate
642                  * completes, any ranges invalidated concurrently will
643                  * accumulate and persist until all outstanding invalidates
644                  * complete.
645                  */
646                 kvm->mmu_notifier_range_start =
647                         min(kvm->mmu_notifier_range_start, start);
648                 kvm->mmu_notifier_range_end =
649                         max(kvm->mmu_notifier_range_end, end);
650         }
651 }
652
/*
 * .invalidate_range_start() notifier: mark an invalidation as active and
 * unmap the affected gfns.  The walk uses kvm_unmap_gfn_range as handler,
 * kvm_inc_notifier_count as on_lock callback and flushes remote TLBs if
 * any handler returned true.  Always returns 0.
 */
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        const struct kvm_hva_range hva_range = {
                .start          = range->start,
                .end            = range->end,
                .pte            = __pte(0),
                .handler        = kvm_unmap_gfn_range,
                .on_lock        = kvm_inc_notifier_count,
                .flush_on_ret   = true,
                .may_block      = mmu_notifier_range_blockable(range),
        };

        trace_kvm_unmap_hva_range(range->start, range->end);

        /*
         * Prevent memslot modification between range_start() and range_end()
         * so that conditionally locking provides the same result in both
         * functions.  Without that guarantee, the mmu_notifier_count
         * adjustments will be imbalanced.
         *
         * Pairs with the decrement in range_end().
         */
        spin_lock(&kvm->mn_invalidate_lock);
        kvm->mn_active_invalidate_count++;
        spin_unlock(&kvm->mn_invalidate_lock);

        __kvm_handle_hva_range(kvm, &hva_range);

        return 0;
}
685
/*
 * Account for the completion of an invalidation: bump
 * kvm->mmu_notifier_seq, then drop kvm->mmu_notifier_count.  @start and
 * @end are unused here.
 */
void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start,
                                   unsigned long end)
{
        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
         * been freed.
         */
        kvm->mmu_notifier_seq++;
        smp_wmb();
        /*
         * The above sequence increase must be visible before the
         * below count decrease, which is ensured by the smp_wmb above
         * in conjunction with the smp_rmb in mmu_notifier_retry().
         */
        kvm->mmu_notifier_count--;
}
703
704 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
705                                         const struct mmu_notifier_range *range)
706 {
707         struct kvm *kvm = mmu_notifier_to_kvm(mn);
708         const struct kvm_hva_range hva_range = {
709                 .start          = range->start,
710                 .end            = range->end,
711                 .pte            = __pte(0),
712                 .handler        = (void *)kvm_null_fn,
713                 .on_lock        = kvm_dec_notifier_count,
714                 .flush_on_ret   = false,
715                 .may_block      = mmu_notifier_range_blockable(range),
716         };
717         bool wake;
718
719         __kvm_handle_hva_range(kvm, &hva_range);
720
721         /* Pairs with the increment in range_start(). */
722         spin_lock(&kvm->mn_invalidate_lock);
723         wake = (--kvm->mn_active_invalidate_count == 0);
724         spin_unlock(&kvm->mn_invalidate_lock);
725
726         /*
727          * There can only be one waiter, since the wait happens under
728          * slots_lock.
729          */
730         if (wake)
731                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
732
733         BUG_ON(kvm->mmu_notifier_count < 0);
734 }
735
736 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
737                                               struct mm_struct *mm,
738                                               unsigned long start,
739                                               unsigned long end)
740 {
741         trace_kvm_age_hva(start, end);
742
743         return kvm_handle_hva_range(mn, start, end, __pte(0), kvm_age_gfn);
744 }
745
746 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
747                                         struct mm_struct *mm,
748                                         unsigned long start,
749                                         unsigned long end)
750 {
751         trace_kvm_age_hva(start, end);
752
753         /*
754          * Even though we do not flush TLB, this will still adversely
755          * affect performance on pre-Haswell Intel EPT, where there is
756          * no EPT Access Bit to clear so that we have to tear down EPT
757          * tables instead. If we find this unacceptable, we can always
758          * add a parameter to kvm_age_hva so that it effectively doesn't
759          * do anything on clear_young.
760          *
761          * Also note that currently we never issue secondary TLB flushes
762          * from clear_young, leaving this job up to the regular system
763          * cadence. If we find this inaccurate, we might come up with a
764          * more sophisticated heuristic later.
765          */
766         return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
767 }
768
769 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
770                                        struct mm_struct *mm,
771                                        unsigned long address)
772 {
773         trace_kvm_test_age_hva(address);
774
775         return kvm_handle_hva_range_no_flush(mn, address, address + 1,
776                                              kvm_test_age_gfn);
777 }
778
779 static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
780                                      struct mm_struct *mm)
781 {
782         struct kvm *kvm = mmu_notifier_to_kvm(mn);
783         int idx;
784
785         idx = srcu_read_lock(&kvm->srcu);
786         kvm_arch_flush_shadow_all(kvm);
787         srcu_read_unlock(&kvm->srcu, idx);
788 }
789
/* MMU notifier callbacks installed on each VM by kvm_init_mmu_notifier(). */
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range       = kvm_mmu_notifier_invalidate_range,
        .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,
        .clear_young            = kvm_mmu_notifier_clear_young,
        .test_young             = kvm_mmu_notifier_test_young,
        .change_pte             = kvm_mmu_notifier_change_pte,
        .release                = kvm_mmu_notifier_release,
};
800
/*
 * Install kvm_mmu_notifier_ops on @kvm's notifier and register it against
 * the calling task's mm.  Returns the result of mmu_notifier_register().
 */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}
806
807 #else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
808
/* MMU notifiers are not in use in this configuration: nothing to do. */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        return 0;
}
813
814 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
815
816 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
817 static int kvm_pm_notifier_call(struct notifier_block *bl,
818                                 unsigned long state,
819                                 void *unused)
820 {
821         struct kvm *kvm = container_of(bl, struct kvm, pm_notifier);
822
823         return kvm_arch_pm_notifier(kvm, state);
824 }
825
/*
 * Set up @kvm's PM notifier block (callback and priority) and register it.
 * The fields are filled in before registration.
 */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
        kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
        /* Suspend KVM before we suspend ftrace, RCU, etc. */
        kvm->pm_notifier.priority = INT_MAX;
        register_pm_notifier(&kvm->pm_notifier);
}
833
/* Unregister the PM notifier block registered by kvm_init_pm_notifier(). */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
        unregister_pm_notifier(&kvm->pm_notifier);
}
838 #else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
/* No-op: the architecture does not provide a KVM PM notifier. */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}
842
/* No-op counterpart of the stub kvm_init_pm_notifier(). */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
846 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
847
848 static struct kvm_memslots *kvm_alloc_memslots(void)
849 {
850         int i;
851         struct kvm_memslots *slots;
852
853         slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
854         if (!slots)
855                 return NULL;
856
857         for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
858                 slots->id_to_index[i] = -1;
859
860         return slots;
861 }
862
863 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
864 {
865         if (!memslot->dirty_bitmap)
866                 return;
867
868         kvfree(memslot->dirty_bitmap);
869         memslot->dirty_bitmap = NULL;
870 }
871
/*
 * Free the resources attached to @slot: its dirty bitmap and whatever the
 * architecture hook releases.  The slot structure itself is not freed; its
 * flags and page count are zeroed afterwards.
 */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        kvm_destroy_dirty_bitmap(slot);

        kvm_arch_free_memslot(kvm, slot);

        /* Cleared only after the arch hook has run on the slot. */
        slot->flags = 0;
        slot->npages = 0;
}
881
882 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
883 {
884         struct kvm_memory_slot *memslot;
885
886         if (!slots)
887                 return;
888
889         kvm_for_each_memslot(memslot, slots)
890                 kvm_free_memslot(kvm, memslot);
891
892         kvfree(slots);
893 }
894
895 static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
896 {
897         switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
898         case KVM_STATS_TYPE_INSTANT:
899                 return 0444;
900         case KVM_STATS_TYPE_CUMULATIVE:
901         case KVM_STATS_TYPE_PEAK:
902         default:
903                 return 0644;
904         }
905 }
906
907
908 static void kvm_destroy_vm_debugfs(struct kvm *kvm)
909 {
910         int i;
911         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
912                                       kvm_vcpu_stats_header.num_desc;
913
914         if (IS_ERR(kvm->debugfs_dentry))
915                 return;
916
917         debugfs_remove_recursive(kvm->debugfs_dentry);
918
919         if (kvm->debugfs_stat_data) {
920                 for (i = 0; i < kvm_debugfs_num_entries; i++)
921                         kfree(kvm->debugfs_stat_data[i]);
922                 kfree(kvm->debugfs_stat_data);
923         }
924 }
925
926 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
927 {
928         static DEFINE_MUTEX(kvm_debugfs_lock);
929         struct dentry *dent;
930         char dir_name[ITOA_MAX_LEN * 2];
931         struct kvm_stat_data *stat_data;
932         const struct _kvm_stats_desc *pdesc;
933         int i, ret;
934         int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
935                                       kvm_vcpu_stats_header.num_desc;
936
937         if (!debugfs_initialized())
938                 return 0;
939
940         snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
941         mutex_lock(&kvm_debugfs_lock);
942         dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
943         if (dent) {
944                 pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
945                 dput(dent);
946                 mutex_unlock(&kvm_debugfs_lock);
947                 return 0;
948         }
949         dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
950         mutex_unlock(&kvm_debugfs_lock);
951         if (IS_ERR(dent))
952                 return 0;
953
954         kvm->debugfs_dentry = dent;
955         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
956                                          sizeof(*kvm->debugfs_stat_data),
957                                          GFP_KERNEL_ACCOUNT);
958         if (!kvm->debugfs_stat_data)
959                 return -ENOMEM;
960
961         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
962                 pdesc = &kvm_vm_stats_desc[i];
963                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
964                 if (!stat_data)
965                         return -ENOMEM;
966
967                 stat_data->kvm = kvm;
968                 stat_data->desc = pdesc;
969                 stat_data->kind = KVM_STAT_VM;
970                 kvm->debugfs_stat_data[i] = stat_data;
971                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
972                                     kvm->debugfs_dentry, stat_data,
973                                     &stat_fops_per_vm);
974         }
975
976         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
977                 pdesc = &kvm_vcpu_stats_desc[i];
978                 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
979                 if (!stat_data)
980                         return -ENOMEM;
981
982                 stat_data->kvm = kvm;
983                 stat_data->desc = pdesc;
984                 stat_data->kind = KVM_STAT_VCPU;
985                 kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
986                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
987                                     kvm->debugfs_dentry, stat_data,
988                                     &stat_fops_per_vm);
989         }
990
991         ret = kvm_arch_create_vm_debugfs(kvm);
992         if (ret) {
993                 kvm_destroy_vm_debugfs(kvm);
994                 return i;
995         }
996
997         return 0;
998 }
999
/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 *
 * Weak default: does nothing and reports success.  A non-zero return makes
 * kvm_create_vm() unwind and fail with that value.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
        return 0;
}
1008
/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 *
 * Weak default: does nothing.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}
1016
/*
 * Called after the per-VM debugfs directory has been created.  At that point
 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
 * be created under it.  Cleanup is done automatically and recursively by
 * kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
 *
 * Weak default: creates nothing and reports success.
 */
int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
        return 0;
}
1027
1028 static struct kvm *kvm_create_vm(unsigned long type)
1029 {
1030         struct kvm *kvm = kvm_arch_alloc_vm();
1031         int r = -ENOMEM;
1032         int i;
1033
1034         if (!kvm)
1035                 return ERR_PTR(-ENOMEM);
1036
1037         KVM_MMU_LOCK_INIT(kvm);
1038         mmgrab(current->mm);
1039         kvm->mm = current->mm;
1040         kvm_eventfd_init(kvm);
1041         mutex_init(&kvm->lock);
1042         mutex_init(&kvm->irq_lock);
1043         mutex_init(&kvm->slots_lock);
1044         mutex_init(&kvm->slots_arch_lock);
1045         spin_lock_init(&kvm->mn_invalidate_lock);
1046         rcuwait_init(&kvm->mn_memslots_update_rcuwait);
1047
1048         INIT_LIST_HEAD(&kvm->devices);
1049
1050         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1051
1052         /*
1053          * Force subsequent debugfs file creations to fail if the VM directory
1054          * is not created (by kvm_create_vm_debugfs()).
1055          */
1056         kvm->debugfs_dentry = ERR_PTR(-ENOENT);
1057
1058         if (init_srcu_struct(&kvm->srcu))
1059                 goto out_err_no_srcu;
1060         if (init_srcu_struct(&kvm->irq_srcu))
1061                 goto out_err_no_irq_srcu;
1062
1063         refcount_set(&kvm->users_count, 1);
1064         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1065                 struct kvm_memslots *slots = kvm_alloc_memslots();
1066
1067                 if (!slots)
1068                         goto out_err_no_arch_destroy_vm;
1069                 /* Generations must be different for each address space. */
1070                 slots->generation = i;
1071                 rcu_assign_pointer(kvm->memslots[i], slots);
1072         }
1073
1074         for (i = 0; i < KVM_NR_BUSES; i++) {
1075                 rcu_assign_pointer(kvm->buses[i],
1076                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1077                 if (!kvm->buses[i])
1078                         goto out_err_no_arch_destroy_vm;
1079         }
1080
1081         kvm->max_halt_poll_ns = halt_poll_ns;
1082
1083         r = kvm_arch_init_vm(kvm, type);
1084         if (r)
1085                 goto out_err_no_arch_destroy_vm;
1086
1087         r = hardware_enable_all();
1088         if (r)
1089                 goto out_err_no_disable;
1090
1091 #ifdef CONFIG_HAVE_KVM_IRQFD
1092         INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1093 #endif
1094
1095         r = kvm_init_mmu_notifier(kvm);
1096         if (r)
1097                 goto out_err_no_mmu_notifier;
1098
1099         r = kvm_arch_post_init_vm(kvm);
1100         if (r)
1101                 goto out_err;
1102
1103         mutex_lock(&kvm_lock);
1104         list_add(&kvm->vm_list, &vm_list);
1105         mutex_unlock(&kvm_lock);
1106
1107         preempt_notifier_inc();
1108         kvm_init_pm_notifier(kvm);
1109
1110         /*
1111          * When the fd passed to this ioctl() is opened it pins the module,
1112          * but try_module_get() also prevents getting a reference if the module
1113          * is in MODULE_STATE_GOING (e.g. if someone ran "rmmod --wait").
1114          */
1115         if (!try_module_get(kvm_chardev_ops.owner)) {
1116                 r = -ENODEV;
1117                 goto out_err;
1118         }
1119
1120         return kvm;
1121
1122 out_err:
1123 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1124         if (kvm->mmu_notifier.ops)
1125                 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
1126 #endif
1127 out_err_no_mmu_notifier:
1128         hardware_disable_all();
1129 out_err_no_disable:
1130         kvm_arch_destroy_vm(kvm);
1131 out_err_no_arch_destroy_vm:
1132         WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1133         for (i = 0; i < KVM_NR_BUSES; i++)
1134                 kfree(kvm_get_bus(kvm, i));
1135         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
1136                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
1137         cleanup_srcu_struct(&kvm->irq_srcu);
1138 out_err_no_irq_srcu:
1139         cleanup_srcu_struct(&kvm->srcu);
1140 out_err_no_srcu:
1141         kvm_arch_free_vm(kvm);
1142         mmdrop(current->mm);
1143         return ERR_PTR(r);
1144 }
1145
1146 static void kvm_destroy_devices(struct kvm *kvm)
1147 {
1148         struct kvm_device *dev, *tmp;
1149
1150         /*
1151          * We do not need to take the kvm->lock here, because nobody else
1152          * has a reference to the struct kvm at this point and therefore
1153          * cannot access the devices list anyhow.
1154          */
1155         list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1156                 list_del(&dev->vm_node);
1157                 dev->ops->destroy(dev);
1158         }
1159 }
1160
/*
 * Tear down a VM.  Called from kvm_put_kvm() once the last reference to @kvm
 * has been dropped; undoes what kvm_create_vm() set up (PM notifier, vm_list
 * membership, I/O buses, MMU notifier, memslots, SRCU state, hardware enable
 * count, mm and module references).
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
        int i;
        /* Saved up front: used by mmdrop() after kvm_arch_free_vm(kvm). */
        struct mm_struct *mm = kvm->mm;

        kvm_destroy_pm_notifier(kvm);
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        kvm_arch_sync_events(kvm);
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
        kvm_arch_pre_destroy_vm(kvm);

        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

                if (bus)
                        kvm_io_bus_destroy(bus);
                kvm->buses[i] = NULL;
        }
        kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
        /*
         * At this point, pending calls to invalidate_range_start()
         * have completed but no more MMU notifiers will run, so
         * mn_active_invalidate_count may remain unbalanced.
         * No threads can be waiting in install_new_memslots as the
         * last reference on KVM has been dropped, but freeing
         * memslots would deadlock without this manual intervention.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
        kvm->mn_active_invalidate_count = 0;
#else
        kvm_arch_flush_shadow_all(kvm);
#endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
        cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
        /* @kvm must not be touched past this point. */
        kvm_arch_free_vm(kvm);
        /* Balance preempt_notifier_inc()/hardware_enable_all() from creation. */
        preempt_notifier_dec();
        hardware_disable_all();
        /* Drop the mm and module references taken in kvm_create_vm(). */
        mmdrop(mm);
        module_put(kvm_chardev_ops.owner);
}
1211
/*
 * Take an additional reference on @kvm.  Each reference is dropped with
 * kvm_put_kvm().
 */
void kvm_get_kvm(struct kvm *kvm)
{
        refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);
1217
/*
 * Safe variant of kvm_get_kvm(): take a reference only if the VM's reference
 * count has not already dropped to zero, i.e. the VM is not being destroyed.
 * Returns true if the reference was taken, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
        return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1227
1228 void kvm_put_kvm(struct kvm *kvm)
1229 {
1230         if (refcount_dec_and_test(&kvm->users_count))
1231                 kvm_destroy_vm(kvm);
1232 }
1233 EXPORT_SYMBOL_GPL(kvm_put_kvm);
1234
/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 *
 * The VM is never destroyed here; a count that reaches zero only triggers a
 * warning.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
        WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1247
/*
 * File release handler for a VM file: the VM is taken from
 * filp->private_data, its irqfds are released and the file's reference on the
 * VM is dropped.  Always returns 0.
 */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
        struct kvm *kvm = filp->private_data;

        kvm_irqfd_release(kvm);

        kvm_put_kvm(kvm);
        return 0;
}
1257
1258 /*
1259  * Allocation size is twice as large as the actual dirty bitmap size.
1260  * See kvm_vm_ioctl_get_dirty_log() why this is needed.
1261  */
1262 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1263 {
1264         unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1265
1266         memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
1267         if (!memslot->dirty_bitmap)
1268                 return -ENOMEM;
1269
1270         return 0;
1271 }
1272
1273 /*
1274  * Delete a memslot by decrementing the number of used slots and shifting all
1275  * other entries in the array forward one spot.
1276  */
1277 static inline void kvm_memslot_delete(struct kvm_memslots *slots,
1278                                       struct kvm_memory_slot *memslot)
1279 {
1280         struct kvm_memory_slot *mslots = slots->memslots;
1281         int i;
1282
1283         if (WARN_ON(slots->id_to_index[memslot->id] == -1))
1284                 return;
1285
1286         slots->used_slots--;
1287
1288         if (atomic_read(&slots->last_used_slot) >= slots->used_slots)
1289                 atomic_set(&slots->last_used_slot, 0);
1290
1291         for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
1292                 mslots[i] = mslots[i + 1];
1293                 slots->id_to_index[mslots[i].id] = i;
1294         }
1295         mslots[i] = *memslot;
1296         slots->id_to_index[memslot->id] = -1;
1297 }
1298
/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array, i.e. the used-slot
 * count before the increment.  Nothing is written to the array here.
 */
static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
{
        return slots->used_slots++;
}
1307
1308 /*
1309  * Move a changed memslot backwards in the array by shifting existing slots
1310  * with a higher GFN toward the front of the array.  Note, the changed memslot
1311  * itself is not preserved in the array, i.e. not swapped at this time, only
1312  * its new index into the array is tracked.  Returns the changed memslot's
1313  * current index into the memslots array.
1314  */
1315 static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
1316                                             struct kvm_memory_slot *memslot)
1317 {
1318         struct kvm_memory_slot *mslots = slots->memslots;
1319         int i;
1320
1321         if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
1322             WARN_ON_ONCE(!slots->used_slots))
1323                 return -1;
1324
1325         /*
1326          * Move the target memslot backward in the array by shifting existing
1327          * memslots with a higher GFN (than the target memslot) towards the
1328          * front of the array.
1329          */
1330         for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
1331                 if (memslot->base_gfn > mslots[i + 1].base_gfn)
1332                         break;
1333
1334                 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);
1335
1336                 /* Shift the next memslot forward one and update its index. */
1337                 mslots[i] = mslots[i + 1];
1338                 slots->id_to_index[mslots[i].id] = i;
1339         }
1340         return i;
1341 }
1342
1343 /*
1344  * Move a changed memslot forwards in the array by shifting existing slots with
1345  * a lower GFN toward the back of the array.  Note, the changed memslot itself
1346  * is not preserved in the array, i.e. not swapped at this time, only its new
1347  * index into the array is tracked.  Returns the changed memslot's final index
1348  * into the memslots array.
1349  */
1350 static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
1351                                            struct kvm_memory_slot *memslot,
1352                                            int start)
1353 {
1354         struct kvm_memory_slot *mslots = slots->memslots;
1355         int i;
1356
1357         for (i = start; i > 0; i--) {
1358                 if (memslot->base_gfn < mslots[i - 1].base_gfn)
1359                         break;
1360
1361                 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);
1362
1363                 /* Shift the next memslot back one and update its index. */
1364                 mslots[i] = mslots[i - 1];
1365                 slots->id_to_index[mslots[i].id] = i;
1366         }
1367         return i;
1368 }
1369
1370 /*
1371  * Re-sort memslots based on their GFN to account for an added, deleted, or
1372  * moved memslot.  Sorting memslots by GFN allows using a binary search during
1373  * memslot lookup.
1374  *
1375  * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
1376  * at memslots[0] has the highest GFN.
1377  *
1378  * The sorting algorithm takes advantage of having initially sorted memslots
1379  * and knowing the position of the changed memslot.  Sorting is also optimized
1380  * by not swapping the updated memslot and instead only shifting other memslots
1381  * and tracking the new index for the update memslot.  Only once its final
1382  * index is known is the updated memslot copied into its position in the array.
1383  *
1384  *  - When deleting a memslot, the deleted memslot simply needs to be moved to
1385  *    the end of the array.
1386  *
 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 *    end of the array and then moves it forward to its correct location.
1389  *
1390  *  - When moving a memslot, the algorithm first moves the updated memslot
1391  *    backward to handle the scenario where the memslot's GFN was changed to a
1392  *    lower value.  update_memslots() then falls through and runs the same flow
1393  *    as creating a memslot to move the memslot forward to handle the scenario
1394  *    where its GFN was changed to a higher value.
1395  *
1396  * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons.  Originally, invalid memslots were denoted by having
1398  * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
1399  * to the end of the array.  The current algorithm uses dedicated logic to
1400  * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1401  *
 * The other historical motivation for highest->lowest was to improve the
1403  * performance of memslot lookup.  KVM originally used a linear search starting
1404  * at memslots[0].  On x86, the largest memslot usually has one of the highest,
1405  * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
1406  * single memslot above the 4gb boundary.  As the largest memslot is also the
1407  * most likely to be referenced, sorting it to the front of the array was
1408  * advantageous.  The current binary search starts from the middle of the array
1409  * and uses an LRU pointer to improve performance for all memslots and GFNs.
1410  */
1411 static void update_memslots(struct kvm_memslots *slots,
1412                             struct kvm_memory_slot *memslot,
1413                             enum kvm_mr_change change)
1414 {
1415         int i;
1416
1417         if (change == KVM_MR_DELETE) {
1418                 kvm_memslot_delete(slots, memslot);
1419         } else {
1420                 if (change == KVM_MR_CREATE)
1421                         i = kvm_memslot_insert_back(slots);
1422                 else
1423                         i = kvm_memslot_move_backward(slots, memslot);
1424                 i = kvm_memslot_move_forward(slots, memslot, i);
1425
1426                 /*
1427                  * Copy the memslot to its new position in memslots and update
1428                  * its index accordingly.
1429                  */
1430                 slots->memslots[i] = *memslot;
1431                 slots->id_to_index[memslot->id] = i;
1432         }
1433 }
1434
1435 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1436 {
1437         u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1438
1439 #ifdef __KVM_HAVE_READONLY_MEM
1440         valid_flags |= KVM_MEM_READONLY;
1441 #endif
1442
1443         if (mem->flags & ~valid_flags)
1444                 return -EINVAL;
1445
1446         return 0;
1447 }
1448
/*
 * Publish @slots as the active memslots of address space @as_id and return
 * the previously active set, which the caller then owns.
 *
 * Must be entered with kvm->slots_arch_lock held; the lock is released here,
 * before waiting for SRCU readers.  While the switch is in flight the new
 * set's generation carries KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; once readers
 * have drained, the flag is dropped and the generation is advanced by
 * KVM_ADDRESS_SPACE_NUM.
 */
static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
                int as_id, struct kvm_memslots *slots)
{
        struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
        u64 gen = old_memslots->generation;

        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Do not store the new memslots while there are invalidations in
         * progress, otherwise the locking in invalidate_range_start and
         * invalidate_range_end will be unbalanced.
         */
        spin_lock(&kvm->mn_invalidate_lock);
        prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
        while (kvm->mn_active_invalidate_count) {
                /* Sleep with the spinlock dropped; recheck the count once woken. */
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&kvm->mn_invalidate_lock);
                schedule();
                spin_lock(&kvm->mn_invalidate_lock);
        }
        finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
        rcu_assign_pointer(kvm->memslots[as_id], slots);
        spin_unlock(&kvm->mn_invalidate_lock);

        /*
         * Acquired in kvm_set_memslot. Must be released before synchronize
         * SRCU below in order to avoid deadlock with another thread
         * acquiring the slots_arch_lock in an srcu critical section.
         */
        mutex_unlock(&kvm->slots_arch_lock);

        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * Increment the new memslot generation a second time, dropping the
         * update in-progress flag and incrementing the generation based on
         * the number of address spaces.  This provides a unique and easily
         * identifiable generation number while the memslots are in flux.
         */
        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Generations must be unique even across address spaces.  We do not need
         * a global counter for that, instead the generation space is evenly split
         * across address spaces.  For example, with two address spaces, address
         * space 0 will use generations 0, 2, 4, ... while address space 1 will
         * use generations 1, 3, 5, ...
         */
        gen += KVM_ADDRESS_SPACE_NUM;

        /* The arch hook sees the final generation before it is stored in @slots. */
        kvm_arch_memslots_updated(kvm, gen);

        slots->generation = gen;

        return old_memslots;
}
1507
1508 static size_t kvm_memslots_size(int slots)
1509 {
1510         return sizeof(struct kvm_memslots) +
1511                (sizeof(struct kvm_memory_slot) * slots);
1512 }
1513
/*
 * Copy the container header and the used entries of @from into @to.  The
 * copy length is derived from @from's used-slot count, so @to must be at
 * least that large.
 */
static void kvm_copy_memslots(struct kvm_memslots *to,
                              struct kvm_memslots *from)
{
        memcpy(to, from, kvm_memslots_size(from->used_slots));
}
1519
1520 /*
1521  * Note, at a minimum, the current number of used slots must be allocated, even
1522  * when deleting a memslot, as we need a complete duplicate of the memslots for
1523  * use when invalidating a memslot prior to deleting/moving the memslot.
1524  */
1525 static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
1526                                              enum kvm_mr_change change)
1527 {
1528         struct kvm_memslots *slots;
1529         size_t new_size;
1530
1531         if (change == KVM_MR_CREATE)
1532                 new_size = kvm_memslots_size(old->used_slots + 1);
1533         else
1534                 new_size = kvm_memslots_size(old->used_slots);
1535
1536         slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
1537         if (likely(slots))
1538                 kvm_copy_memslots(slots, old);
1539
1540         return slots;
1541 }
1542
/*
 * Apply a create/delete/move/flags change of a single memslot to address
 * space @as_id.
 *
 * @kvm:    VM being modified.
 * @mem:    userspace description of the region, forwarded to the arch hooks.
 * @new:    desired new state of the memslot; its arch data is overwritten
 *          with that of the current slot before the arch prepare hook runs.
 * @as_id:  address space the memslot belongs to.
 * @change: kind of change being made.
 *
 * The work is done on a duplicate of the active memslots.  For DELETE and
 * MOVE the slot is first published as KVM_MEMSLOT_INVALID and its shadow
 * state flushed; then the final layout is installed.
 *
 * Returns 0 on success, -ENOMEM if the duplicate cannot be allocated, or the
 * error from kvm_arch_prepare_memory_region().  On that error the INVALID
 * marking, if any, is rolled back.  kvm->slots_arch_lock is not held on
 * return.
 */
static int kvm_set_memslot(struct kvm *kvm,
                           const struct kvm_userspace_memory_region *mem,
                           struct kvm_memory_slot *new, int as_id,
                           enum kvm_mr_change change)
{
        struct kvm_memory_slot *slot, old;
        struct kvm_memslots *slots;
        int r;

        /*
         * Released in install_new_memslots.
         *
         * Must be held from before the current memslots are copied until
         * after the new memslots are installed with rcu_assign_pointer,
         * then released before the synchronize srcu in install_new_memslots.
         *
         * When modifying memslots outside of the slots_lock, must be held
         * before reading the pointer to the current memslots until after all
         * changes to those memslots are complete.
         *
         * These rules ensure that installing new memslots does not lose
         * changes made to the previous memslots.
         */
        mutex_lock(&kvm->slots_arch_lock);

        slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
        if (!slots) {
                mutex_unlock(&kvm->slots_arch_lock);
                return -ENOMEM;
        }

        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                /*
                 * Note, the INVALID flag needs to be in the appropriate entry
                 * in the freshly allocated memslots, not in @old or @new.
                 */
                slot = id_to_memslot(slots, new->id);
                slot->flags |= KVM_MEMSLOT_INVALID;

                /*
                 * We can re-use the memory from the old memslots.
                 * It will be overwritten with a copy of the new memslots
                 * after reacquiring the slots_arch_lock below.
                 */
                slots = install_new_memslots(kvm, as_id, slots);

                /* From this point no new shadow pages pointing to a deleted,
                 * or moved, memslot will be created.
                 *
                 * validation of sp->gfn happens in:
                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
                 *      - kvm_is_visible_gfn (mmu_check_root)
                 */
                kvm_arch_flush_shadow_memslot(kvm, slot);

                /* Released in install_new_memslots. */
                mutex_lock(&kvm->slots_arch_lock);

                /*
                 * The arch-specific fields of the memslots could have changed
                 * between releasing the slots_arch_lock in
                 * install_new_memslots and here, so get a fresh copy of the
                 * slots.
                 */
                kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
        }

        /*
         * Make a full copy of the old memslot, the pointer will become stale
         * when the memslots are re-sorted by update_memslots(), and the old
         * memslot needs to be referenced after calling update_memslots(), e.g.
         * to free its resources and for arch specific behavior.  This needs to
         * happen *after* (re)acquiring slots_arch_lock.
         */
        slot = id_to_memslot(slots, new->id);
        if (slot) {
                old = *slot;
        } else {
                /* No existing slot with this id: only valid when creating. */
                WARN_ON_ONCE(change != KVM_MR_CREATE);
                memset(&old, 0, sizeof(old));
                old.id = new->id;
                old.as_id = as_id;
        }

        /* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */
        memcpy(&new->arch, &old.arch, sizeof(old.arch));

        r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
        if (r)
                goto out_slots;

        update_memslots(slots, new, change);
        /* @slots now refers to the previously active set, freed below. */
        slots = install_new_memslots(kvm, as_id, slots);

        kvm_arch_commit_memory_region(kvm, mem, &old, new, change);

        /* Free the old memslot's metadata.  Note, this is the full copy!!! */
        if (change == KVM_MR_DELETE)
                kvm_free_memslot(kvm, &old);

        kvfree(slots);
        return 0;

out_slots:
        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                /*
                 * Undo the INVALID marking by installing a set with the flag
                 * cleared; install_new_memslots() also drops slots_arch_lock.
                 */
                slot = id_to_memslot(slots, new->id);
                slot->flags &= ~KVM_MEMSLOT_INVALID;
                slots = install_new_memslots(kvm, as_id, slots);
        } else {
                mutex_unlock(&kvm->slots_arch_lock);
        }
        kvfree(slots);
        return r;
}
1657
1658 static int kvm_delete_memslot(struct kvm *kvm,
1659                               const struct kvm_userspace_memory_region *mem,
1660                               struct kvm_memory_slot *old, int as_id)
1661 {
1662         struct kvm_memory_slot new;
1663
1664         if (!old->npages)
1665                 return -EINVAL;
1666
1667         memset(&new, 0, sizeof(new));
1668         new.id = old->id;
1669         /*
1670          * This is only for debugging purpose; it should never be referenced
1671          * for a removed memslot.
1672          */
1673         new.as_id = as_id;
1674
1675         return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE);
1676 }
1677
/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 *
 * @mem->slot carries the slot id in its low 16 bits and the address space id
 * in the bits above them.  A zero @mem->memory_size requests deletion of the
 * slot; any other request is classified as KVM_MR_CREATE, KVM_MR_MOVE or
 * KVM_MR_FLAGS_ONLY by comparing it with the current slot of the same id.
 *
 * Returns 0 on success (including when there is nothing to change), -EINVAL
 * for a request that fails validation, -EEXIST if the new range overlaps a
 * different slot, or the error returned by one of the helpers.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	struct kvm_memory_slot old, new;
	struct kvm_memory_slot *tmp;
	enum kvm_mr_change change;
	int as_id, id;
	int r;

	r = check_memory_region_flags(mem);
	if (r)
		return r;

	/* Split the combined slot number into address space and slot id. */
	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	/* Size must be page aligned and must fit in an unsigned long. */
	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
	    (mem->memory_size != (unsigned long)mem->memory_size))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	/* Reject guest physical ranges that wrap around. */
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific behavior.
	 */
	tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	if (tmp) {
		old = *tmp;
		tmp = NULL;
	} else {
		/* No slot with this id yet: use an empty placeholder. */
		memset(&old, 0, sizeof(old));
		old.id = id;
	}

	/* A size of zero is a request to delete the slot. */
	if (!mem->memory_size)
		return kvm_delete_memslot(kvm, mem, &old, as_id);

	new.as_id = as_id;
	new.id = id;
	new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	new.npages = mem->memory_size >> PAGE_SHIFT;
	new.flags = mem->flags;
	new.userspace_addr = mem->userspace_addr;

	if (new.npages > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

	if (!old.npages) {
		change = KVM_MR_CREATE;
		new.dirty_bitmap = NULL;
	} else { /* Modify an existing slot. */
		/*
		 * The userspace address, the size and the read-only flag of
		 * an existing slot cannot be changed.
		 */
		if ((new.userspace_addr != old.userspace_addr) ||
		    (new.npages != old.npages) ||
		    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
			return -EINVAL;

		if (new.base_gfn != old.base_gfn)
			change = KVM_MR_MOVE;
		else if (new.flags != old.flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;

		/* Copy dirty_bitmap from the current memslot. */
		new.dirty_bitmap = old.dirty_bitmap;
	}

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
			/* The slot being replaced may overlap itself. */
			if (tmp->id == id)
				continue;
			if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
			      (new.base_gfn >= tmp->base_gfn + tmp->npages)))
				return -EEXIST;
		}
	}

	/* Allocate/free page dirty bitmap as needed */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;
	else if (!new.dirty_bitmap && !kvm->dirty_ring_size) {
		/* No bitmap is allocated when a dirty ring is configured. */
		r = kvm_alloc_dirty_bitmap(&new);
		if (r)
			return r;

		/* Start with every page marked dirty if so configured. */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			bitmap_set(new.dirty_bitmap, 0, new.npages);
	}

	r = kvm_set_memslot(kvm, mem, &new, as_id, change);
	if (r)
		goto out_bitmap;

	/* The new slot dropped dirty logging: release the old bitmap. */
	if (old.dirty_bitmap && !new.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&old);
	return 0;

out_bitmap:
	/* Only free a bitmap that was allocated by this call. */
	if (new.dirty_bitmap && !old.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&new);
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
1804
1805 int kvm_set_memory_region(struct kvm *kvm,
1806                           const struct kvm_userspace_memory_region *mem)
1807 {
1808         int r;
1809
1810         mutex_lock(&kvm->slots_lock);
1811         r = __kvm_set_memory_region(kvm, mem);
1812         mutex_unlock(&kvm->slots_lock);
1813         return r;
1814 }
1815 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
1816
1817 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
1818                                           struct kvm_userspace_memory_region *mem)
1819 {
1820         if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
1821                 return -EINVAL;
1822
1823         return kvm_set_memory_region(kvm, mem);
1824 }
1825
1826 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
1827 /**
1828  * kvm_get_dirty_log - get a snapshot of dirty pages
1829  * @kvm:        pointer to kvm instance
1830  * @log:        slot id and address to which we copy the log
1831  * @is_dirty:   set to '1' if any dirty pages were found
1832  * @memslot:    set to the associated memslot, always valid on success
1833  */
1834 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
1835                       int *is_dirty, struct kvm_memory_slot **memslot)
1836 {
1837         struct kvm_memslots *slots;
1838         int i, as_id, id;
1839         unsigned long n;
1840         unsigned long any = 0;
1841
1842         /* Dirty ring tracking is exclusive to dirty log tracking */
1843         if (kvm->dirty_ring_size)
1844                 return -ENXIO;
1845
1846         *memslot = NULL;
1847         *is_dirty = 0;
1848
1849         as_id = log->slot >> 16;
1850         id = (u16)log->slot;
1851         if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1852                 return -EINVAL;
1853
1854         slots = __kvm_memslots(kvm, as_id);
1855         *memslot = id_to_memslot(slots, id);
1856         if (!(*memslot) || !(*memslot)->dirty_bitmap)
1857                 return -ENOENT;
1858
1859         kvm_arch_sync_dirty_log(kvm, *memslot);
1860
1861         n = kvm_dirty_bitmap_bytes(*memslot);
1862
1863         for (i = 0; !any && i < n/sizeof(long); ++i)
1864                 any = (*memslot)->dirty_bitmap[i];
1865
1866         if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
1867                 return -EFAULT;
1868
1869         if (any)
1870                 *is_dirty = 1;
1871         return 0;
1872 }
1873 EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1874
1875 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently. So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 *
 * Return: 0 on success, -ENXIO if the VM uses a dirty ring, -EINVAL if the
 * slot number is out of range, -ENOENT if the slot does not exist or has no
 * dirty bitmap, -EFAULT if the snapshot could not be copied to userspace.
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	/* Dirty ring tracking is exclusive to dirty log tracking */
	if (kvm->dirty_ring_size)
		return -ENXIO;

	/* Low 16 bits of the slot number are the id, the rest the as_id. */
	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	kvm_arch_sync_dirty_log(kvm, memslot);

	/* Size of the bitmap in bytes; also the amount copied to userspace. */
	n = kvm_dirty_bitmap_bytes(memslot);
	flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architecture
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
		/* The live bitmap is copied out as is; nothing is cleared. */
		dirty_bitmap_buffer = dirty_bitmap;
	} else {
		/* Build the snapshot in the second bitmap, starting from zero. */
		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
		memset(dirty_bitmap_buffer, 0, n);

		KVM_MMU_LOCK(kvm);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
			gfn_t offset;

			/* Skip words with no dirty bits. */
			if (!dirty_bitmap[i])
				continue;

			flush = true;
			/* Fetch and clear the word in one atomic exchange. */
			mask = xchg(&dirty_bitmap[i], 0);
			dirty_bitmap_buffer[i] = mask;

			/* Page offset within the slot of this word's first bit. */
			offset = i * BITS_PER_LONG;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
		KVM_MMU_UNLOCK(kvm);
	}

	/* Flush before the copy below, so a failed copy cannot skip it. */
	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}
1967
1968
1969 /**
1970  * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
1971  * @kvm: kvm instance
1972  * @log: slot id and address to which we copy the log
1973  *
1974  * Steps 1-4 below provide general overview of dirty page logging. See
1975  * kvm_get_dirty_log_protect() function description for additional details.
1976  *
1977  * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
1978  * always flush the TLB (step 4) even if previous step failed  and the dirty
1979  * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
1980  * does not preclude user space subsequent dirty log read. Flushing TLB ensures
1981  * writes will be marked dirty for next log read.
1982  *
1983  *   1. Take a snapshot of the bit and clear it if needed.
1984  *   2. Write protect the corresponding page.
1985  *   3. Copy the snapshot to the userspace.
1986  *   4. Flush TLB's if needed.
1987  */
1988 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1989                                       struct kvm_dirty_log *log)
1990 {
1991         int r;
1992
1993         mutex_lock(&kvm->slots_lock);
1994
1995         r = kvm_get_dirty_log_protect(kvm, log);
1996
1997         mutex_unlock(&kvm->slots_lock);
1998         return r;
1999 }
2000
/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 *
 * Return: 0 on success, -ENXIO if the VM uses a dirty ring, -EINVAL if the
 * slot number or the requested page range is invalid, -ENOENT if the slot
 * does not exist or has no dirty bitmap, -EFAULT if the bitmap could not be
 * read from userspace.
 */
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
				       struct kvm_clear_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int as_id, id;
	gfn_t offset;
	unsigned long i, n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	/* Dirty ring tracking is exclusive to dirty log tracking */
	if (kvm->dirty_ring_size)
		return -ENXIO;

	/* Low 16 bits of the slot number are the id, the rest the as_id. */
	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	/* The first page must be a multiple of 64. */
	if (log->first_page & 63)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	/* Bytes to fetch from userspace: num_pages rounded up to whole longs. */
	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;

	/*
	 * The range must lie within the slot, and num_pages must be a
	 * multiple of 64 unless the range runs to the end of the slot.
	 */
	if (log->first_page > memslot->npages ||
	    log->num_pages > memslot->npages - log->first_page ||
	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
	    return -EINVAL;

	kvm_arch_sync_dirty_log(kvm, memslot);

	flush = false;
	/* Stage the userspace bitmap in the slot's second bitmap. */
	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
		return -EFAULT;

	KVM_MMU_LOCK(kvm);
	/*
	 * From here on n is reused as the number of bitmap words left to
	 * process; i indexes the live bitmap and offset is the page offset of
	 * the current word within the slot.
	 */
	for (offset = log->first_page, i = offset / BITS_PER_LONG,
		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
	     i++, offset += BITS_PER_LONG) {
		unsigned long mask = *dirty_bitmap_buffer++;
		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
		if (!mask)
			continue;

		/* Clear the requested bits; keep those that were really set. */
		mask &= atomic_long_fetch_andnot(mask, p);

		/*
		 * mask contains the bits that really have been cleared.  This
		 * never includes any bits beyond the length of the memslot (if
		 * the length is not aligned to 64 pages), therefore it is not
		 * a problem if userspace sets them in log->dirty_bitmap.
		 */
		if (mask) {
			flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	KVM_MMU_UNLOCK(kvm);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	return 0;
}
2082
2083 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2084                                         struct kvm_clear_dirty_log *log)
2085 {
2086         int r;
2087
2088         mutex_lock(&kvm->slots_lock);
2089
2090         r = kvm_clear_dirty_log_protect(kvm, log);
2091
2092         mutex_unlock(&kvm->slots_lock);
2093         return r;
2094 }
2095 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2096
2097 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
2098 {
2099         return __gfn_to_memslot(kvm_memslots(kvm), gfn);
2100 }
2101 EXPORT_SYMBOL_GPL(gfn_to_memslot);
2102
2103 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
2104 {
2105         struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2106         struct kvm_memory_slot *slot;
2107         int slot_index;
2108
2109         slot = try_get_memslot(slots, vcpu->last_used_slot, gfn);
2110         if (slot)
2111                 return slot;
2112
2113         /*
2114          * Fall back to searching all memslots. We purposely use
2115          * search_memslots() instead of __gfn_to_memslot() to avoid
2116          * thrashing the VM-wide last_used_index in kvm_memslots.
2117          */
2118         slot = search_memslots(slots, gfn, &slot_index);
2119         if (slot) {
2120                 vcpu->last_used_slot = slot_index;
2121                 return slot;
2122         }
2123
2124         return NULL;
2125 }
2126
2127 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2128 {
2129         struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2130
2131         return kvm_is_visible_memslot(memslot);
2132 }
2133 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2134
2135 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2136 {
2137         struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2138
2139         return kvm_is_visible_memslot(memslot);
2140 }
2141 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2142
2143 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2144 {
2145         struct vm_area_struct *vma;
2146         unsigned long addr, size;
2147
2148         size = PAGE_SIZE;
2149
2150         addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2151         if (kvm_is_error_hva(addr))
2152                 return PAGE_SIZE;
2153
2154         mmap_read_lock(current->mm);
2155         vma = find_vma(current->mm, addr);
2156         if (!vma)
2157                 goto out;
2158
2159         size = vma_kernel_pagesize(vma);
2160
2161 out:
2162         mmap_read_unlock(current->mm);
2163
2164         return size;
2165 }
2166
2167 static bool memslot_is_readonly(struct kvm_memory_slot *slot)
2168 {
2169         return slot->flags & KVM_MEM_READONLY;
2170 }
2171
2172 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2173                                        gfn_t *nr_pages, bool write)
2174 {
2175         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
2176                 return KVM_HVA_ERR_BAD;
2177
2178         if (memslot_is_readonly(slot) && write)
2179                 return KVM_HVA_ERR_RO_BAD;
2180
2181         if (nr_pages)
2182                 *nr_pages = slot->npages - (gfn - slot->base_gfn);
2183
2184         return __gfn_to_hva_memslot(slot, gfn);
2185 }
2186
2187 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2188                                      gfn_t *nr_pages)
2189 {
2190         return __gfn_to_hva_many(slot, gfn, nr_pages, true);
2191 }
2192
2193 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2194                                         gfn_t gfn)
2195 {
2196         return gfn_to_hva_many(slot, gfn, NULL);
2197 }
2198 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2199
2200 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2201 {
2202         return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
2203 }
2204 EXPORT_SYMBOL_GPL(gfn_to_hva);
2205
2206 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2207 {
2208         return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2209 }
2210 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2211
2212 /*
2213  * Return the hva of a @gfn and the R/W attribute if possible.
2214  *
2215  * @slot: the kvm_memory_slot which contains @gfn
2216  * @gfn: the gfn to be translated
2217  * @writable: used to return the read/write attribute of the @slot if the hva
2218  * is valid and @writable is not NULL
2219  */
2220 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2221                                       gfn_t gfn, bool *writable)
2222 {
2223         unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
2224
2225         if (!kvm_is_error_hva(hva) && writable)
2226                 *writable = !memslot_is_readonly(slot);
2227
2228         return hva;
2229 }
2230
2231 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
2232 {
2233         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2234
2235         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2236 }
2237
2238 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
2239 {
2240         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2241
2242         return gfn_to_hva_memslot_prot(slot, gfn, writable);
2243 }
2244
2245 static inline int check_user_page_hwpoison(unsigned long addr)
2246 {
2247         int rc, flags = FOLL_HWPOISON | FOLL_WRITE;
2248
2249         rc = get_user_pages(addr, 1, flags, NULL, NULL);
2250         return rc == -EHWPOISON;
2251 }
2252
2253 /*
2254  * The fast path to get the writable pfn which will be stored in @pfn,
2255  * true indicates success, otherwise false is returned.  It's also the
2256  * only part that runs if we can in atomic context.
2257  */
2258 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2259                             bool *writable, kvm_pfn_t *pfn)
2260 {
2261         struct page *page[1];
2262
2263         /*
2264          * Fast pin a writable pfn only if it is a write fault request
2265          * or the caller allows to map a writable pfn for a read fault
2266          * request.
2267          */
2268         if (!(write_fault || writable))
2269                 return false;
2270
2271         if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
2272                 *pfn = page_to_pfn(page[0]);
2273
2274                 if (writable)
2275                         *writable = true;
2276                 return true;
2277         }
2278
2279         return false;
2280 }
2281
2282 /*
2283  * The slow path to get the pfn of the specified host virtual address,
2284  * 1 indicates success, -errno is returned if error is detected.
2285  */
2286 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2287                            bool *writable, kvm_pfn_t *pfn)
2288 {
2289         unsigned int flags = FOLL_HWPOISON;
2290         struct page *page;
2291         int npages = 0;
2292
2293         might_sleep();
2294
2295         if (writable)
2296                 *writable = write_fault;
2297
2298         if (write_fault)
2299                 flags |= FOLL_WRITE;
2300         if (async)
2301                 flags |= FOLL_NOWAIT;
2302
2303         npages = get_user_pages_unlocked(addr, 1, &page, flags);
2304         if (npages != 1)
2305                 return npages;
2306
2307         /* map read fault as writable if possible */
2308         if (unlikely(!write_fault) && writable) {
2309                 struct page *wpage;
2310
2311                 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
2312                         *writable = true;
2313                         put_page(page);
2314                         page = wpage;
2315                 }
2316         }
2317         *pfn = page_to_pfn(page);
2318         return npages;
2319 }
2320
2321 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2322 {
2323         if (unlikely(!(vma->vm_flags & VM_READ)))
2324                 return false;
2325
2326         if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2327                 return false;
2328
2329         return true;
2330 }
2331
2332 static int kvm_try_get_pfn(kvm_pfn_t pfn)
2333 {
2334         if (kvm_is_reserved_pfn(pfn))
2335                 return 1;
2336         return get_page_unless_zero(pfn_to_page(pfn));
2337 }
2338
/*
 * Resolve @addr to a pfn by reading the PTE directly; hva_to_pfn() uses this
 * for VM_IO / VM_PFNMAP VMAs.
 *
 * If no PTE can be followed, the fault is triggered via fixup_user_fault()
 * and the lookup is repeated once.  The pfn (or KVM_PFN_ERR_RO_FAULT for a
 * write fault on a PTE that is not writable) is stored in @p_pfn.
 *
 * Returns 0 on success, -EAGAIN if fixup_user_fault() reported "unlocked"
 * (presumably the mmap lock was dropped, so the caller must redo the VMA
 * lookup -- confirm against fixup_user_fault()), -EFAULT if no reference
 * could be taken on the page, or the error from the helpers.  @async is not
 * used in this function.
 */
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	kvm_pfn_t pfn;
	pte_t *ptep;
	spinlock_t *ptl;
	int r;

	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		/* Checked before r: "unlocked" takes priority over an error. */
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
		if (r)
			return r;
	}

	/* From here on the PTE lock is held until pte_unmap_unlock(). */
	if (write_fault && !pte_write(*ptep)) {
		/* r is 0 here; the failure is reported through the pfn. */
		pfn = KVM_PFN_ERR_RO_FAULT;
		goto out;
	}

	if (writable)
		*writable = pte_write(*ptep);
	pfn = pte_pfn(*ptep);

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 *
	 * Certain IO or PFNMAP mappings can be backed with valid
	 * struct pages, but be allocated without refcounting e.g.,
	 * tail pages of non-compound higher order allocations, which
	 * would then underflow the refcount when the caller does the
	 * required put_page. Don't allow those pages here.
	 */
	if (!kvm_try_get_pfn(pfn))
		r = -EFAULT;

out:
	pte_unmap_unlock(ptep, ptl);
	/* Written even when r is -EFAULT; callers must check r first. */
	*p_pfn = pfn;

	return r;
}
2404
/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether this function can sleep
 * @async: whether this function need to wait IO complete if the
 *         host page is not in the memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 *
 * Returns the pfn on success.  On failure it returns KVM_PFN_ERR_FAULT or
 * KVM_PFN_ERR_HWPOISON, or whatever error pfn hva_to_pfn_remapped() stored.
 * When @async is not NULL, *@async is set to true if the page could not be
 * obtained but the VMA permits the access.
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	/* Atomic callers may only use the fast path. */
	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	/* Both paths failed: inspect the VMA under the mmap read lock. */
	mmap_read_lock(current->mm);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = vma_lookup(current->mm, addr);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		/* -EAGAIN: look the VMA up again before retrying. */
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		/* Tell an async caller that the access is permitted by the VMA. */
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	mmap_read_unlock(current->mm);
	return pfn;
}
2466
2467 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
2468                                bool atomic, bool *async, bool write_fault,
2469                                bool *writable, hva_t *hva)
2470 {
2471         unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
2472
2473         if (hva)
2474                 *hva = addr;
2475
2476         if (addr == KVM_HVA_ERR_RO_BAD) {
2477                 if (writable)
2478                         *writable = false;
2479                 return KVM_PFN_ERR_RO_FAULT;
2480         }
2481
2482         if (kvm_is_error_hva(addr)) {
2483                 if (writable)
2484                         *writable = false;
2485                 return KVM_PFN_NOSLOT;
2486         }
2487
2488         /* Do not map writable pfn in the readonly memslot. */
2489         if (writable && memslot_is_readonly(slot)) {
2490                 *writable = false;
2491                 writable = NULL;
2492         }
2493
2494         return hva_to_pfn(addr, atomic, async, write_fault,
2495                           writable);
2496 }
2497 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2498
2499 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2500                       bool *writable)
2501 {
2502         return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL,
2503                                     write_fault, writable, NULL);
2504 }
2505 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2506
2507 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
2508 {
2509         return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL);
2510 }
2511 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2512
2513 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
2514 {
2515         return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL);
2516 }
2517 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2518
2519 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2520 {
2521         return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2522 }
2523 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2524
2525 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2526 {
2527         return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2528 }
2529 EXPORT_SYMBOL_GPL(gfn_to_pfn);
2530
2531 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2532 {
2533         return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2534 }
2535 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2536
2537 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2538                             struct page **pages, int nr_pages)
2539 {
2540         unsigned long addr;
2541         gfn_t entry = 0;
2542
2543         addr = gfn_to_hva_many(slot, gfn, &entry);
2544         if (kvm_is_error_hva(addr))
2545                 return -1;
2546
2547         if (entry < nr_pages)
2548                 return 0;
2549
2550         return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages);
2551 }
2552 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2553
2554 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn)
2555 {
2556         if (is_error_noslot_pfn(pfn))
2557                 return KVM_ERR_PTR_BAD_PAGE;
2558
2559         if (kvm_is_reserved_pfn(pfn)) {
2560                 WARN_ON(1);
2561                 return KVM_ERR_PTR_BAD_PAGE;
2562         }
2563
2564         return pfn_to_page(pfn);
2565 }
2566
2567 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
2568 {
2569         kvm_pfn_t pfn;
2570
2571         pfn = gfn_to_pfn(kvm, gfn);
2572
2573         return kvm_pfn_to_page(pfn);
2574 }
2575 EXPORT_SYMBOL_GPL(gfn_to_page);
2576
2577 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
2578 {
2579         if (pfn == 0)
2580                 return;
2581
2582         if (cache)
2583                 cache->pfn = cache->gfn = 0;
2584
2585         if (dirty)
2586                 kvm_release_pfn_dirty(pfn);
2587         else
2588                 kvm_release_pfn_clean(pfn);
2589 }
2590
2591 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
2592                                  struct gfn_to_pfn_cache *cache, u64 gen)
2593 {
2594         kvm_release_pfn(cache->pfn, cache->dirty, cache);
2595
2596         cache->pfn = gfn_to_pfn_memslot(slot, gfn);
2597         cache->gfn = gfn;
2598         cache->dirty = false;
2599         cache->generation = gen;
2600 }
2601
2602 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
2603                          struct kvm_host_map *map,
2604                          struct gfn_to_pfn_cache *cache,
2605                          bool atomic)
2606 {
2607         kvm_pfn_t pfn;
2608         void *hva = NULL;
2609         struct page *page = KVM_UNMAPPED_PAGE;
2610         struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
2611         u64 gen = slots->generation;
2612
2613         if (!map)
2614                 return -EINVAL;
2615
2616         if (cache) {
2617                 if (!cache->pfn || cache->gfn != gfn ||
2618                         cache->generation != gen) {
2619                         if (atomic)
2620                                 return -EAGAIN;
2621                         kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
2622                 }
2623                 pfn = cache->pfn;
2624         } else {
2625                 if (atomic)
2626                         return -EAGAIN;
2627                 pfn = gfn_to_pfn_memslot(slot, gfn);
2628         }
2629         if (is_error_noslot_pfn(pfn))
2630                 return -EINVAL;
2631
2632         if (pfn_valid(pfn)) {
2633                 page = pfn_to_page(pfn);
2634                 if (atomic)
2635                         hva = kmap_atomic(page);
2636                 else
2637                         hva = kmap(page);
2638 #ifdef CONFIG_HAS_IOMEM
2639         } else if (!atomic) {
2640                 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
2641         } else {
2642                 return -EINVAL;
2643 #endif
2644         }
2645
2646         if (!hva)
2647                 return -EFAULT;
2648
2649         map->page = page;
2650         map->hva = hva;
2651         map->pfn = pfn;
2652         map->gfn = gfn;
2653
2654         return 0;
2655 }
2656
2657 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
2658                 struct gfn_to_pfn_cache *cache, bool atomic)
2659 {
2660         return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
2661                         cache, atomic);
2662 }
2663 EXPORT_SYMBOL_GPL(kvm_map_gfn);
2664
2665 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
2666 {
2667         return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
2668                 NULL, false);
2669 }
2670 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2671
2672 static void __kvm_unmap_gfn(struct kvm *kvm,
2673                         struct kvm_memory_slot *memslot,
2674                         struct kvm_host_map *map,
2675                         struct gfn_to_pfn_cache *cache,
2676                         bool dirty, bool atomic)
2677 {
2678         if (!map)
2679                 return;
2680
2681         if (!map->hva)
2682                 return;
2683
2684         if (map->page != KVM_UNMAPPED_PAGE) {
2685                 if (atomic)
2686                         kunmap_atomic(map->hva);
2687                 else
2688                         kunmap(map->page);
2689         }
2690 #ifdef CONFIG_HAS_IOMEM
2691         else if (!atomic)
2692                 memunmap(map->hva);
2693         else
2694                 WARN_ONCE(1, "Unexpected unmapping in atomic context");
2695 #endif
2696
2697         if (dirty)
2698                 mark_page_dirty_in_slot(kvm, memslot, map->gfn);
2699
2700         if (cache)
2701                 cache->dirty |= dirty;
2702         else
2703                 kvm_release_pfn(map->pfn, dirty, NULL);
2704
2705         map->hva = NULL;
2706         map->page = NULL;
2707 }
2708
2709 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 
2710                   struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
2711 {
2712         __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map,
2713                         cache, dirty, atomic);
2714         return 0;
2715 }
2716 EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
2717
2718 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
2719 {
2720         __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn),
2721                         map, NULL, dirty, false);
2722 }
2723 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2724
2725 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2726 {
2727         kvm_pfn_t pfn;
2728
2729         pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn);
2730
2731         return kvm_pfn_to_page(pfn);
2732 }
2733 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page);
2734
2735 void kvm_release_page_clean(struct page *page)
2736 {
2737         WARN_ON(is_error_page(page));
2738
2739         kvm_release_pfn_clean(page_to_pfn(page));
2740 }
2741 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2742
2743 void kvm_release_pfn_clean(kvm_pfn_t pfn)
2744 {
2745         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
2746                 put_page(pfn_to_page(pfn));
2747 }
2748 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2749
2750 void kvm_release_page_dirty(struct page *page)
2751 {
2752         WARN_ON(is_error_page(page));
2753
2754         kvm_release_pfn_dirty(page_to_pfn(page));
2755 }
2756 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2757
2758 void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2759 {
2760         kvm_set_pfn_dirty(pfn);
2761         kvm_release_pfn_clean(pfn);
2762 }
2763 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2764
2765 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
2766 {
2767         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2768                 SetPageDirty(pfn_to_page(pfn));
2769 }
2770 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
2771
2772 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
2773 {
2774         if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
2775                 mark_page_accessed(pfn_to_page(pfn));
2776 }
2777 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
2778
2779 static int next_segment(unsigned long len, int offset)
2780 {
2781         if (len > PAGE_SIZE - offset)
2782                 return PAGE_SIZE - offset;
2783         else
2784                 return len;
2785 }
2786
2787 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
2788                                  void *data, int offset, int len)
2789 {
2790         int r;
2791         unsigned long addr;
2792
2793         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2794         if (kvm_is_error_hva(addr))
2795                 return -EFAULT;
2796         r = __copy_from_user(data, (void __user *)addr + offset, len);
2797         if (r)
2798                 return -EFAULT;
2799         return 0;
2800 }
2801
2802 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
2803                         int len)
2804 {
2805         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2806
2807         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2808 }
2809 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
2810
2811 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
2812                              int offset, int len)
2813 {
2814         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2815
2816         return __kvm_read_guest_page(slot, gfn, data, offset, len);
2817 }
2818 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
2819
2820 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
2821 {
2822         gfn_t gfn = gpa >> PAGE_SHIFT;
2823         int seg;
2824         int offset = offset_in_page(gpa);
2825         int ret;
2826
2827         while ((seg = next_segment(len, offset)) != 0) {
2828                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
2829                 if (ret < 0)
2830                         return ret;
2831                 offset = 0;
2832                 len -= seg;
2833                 data += seg;
2834                 ++gfn;
2835         }
2836         return 0;
2837 }
2838 EXPORT_SYMBOL_GPL(kvm_read_guest);
2839
2840 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
2841 {
2842         gfn_t gfn = gpa >> PAGE_SHIFT;
2843         int seg;
2844         int offset = offset_in_page(gpa);
2845         int ret;
2846
2847         while ((seg = next_segment(len, offset)) != 0) {
2848                 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
2849                 if (ret < 0)
2850                         return ret;
2851                 offset = 0;
2852                 len -= seg;
2853                 data += seg;
2854                 ++gfn;
2855         }
2856         return 0;
2857 }
2858 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
2859
2860 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2861                                    void *data, int offset, unsigned long len)
2862 {
2863         int r;
2864         unsigned long addr;
2865
2866         addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
2867         if (kvm_is_error_hva(addr))
2868                 return -EFAULT;
2869         pagefault_disable();
2870         r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
2871         pagefault_enable();
2872         if (r)
2873                 return -EFAULT;
2874         return 0;
2875 }
2876
2877 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
2878                                void *data, unsigned long len)
2879 {
2880         gfn_t gfn = gpa >> PAGE_SHIFT;
2881         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2882         int offset = offset_in_page(gpa);
2883
2884         return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
2885 }
2886 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
2887
2888 static int __kvm_write_guest_page(struct kvm *kvm,
2889                                   struct kvm_memory_slot *memslot, gfn_t gfn,
2890                                   const void *data, int offset, int len)
2891 {
2892         int r;
2893         unsigned long addr;
2894
2895         addr = gfn_to_hva_memslot(memslot, gfn);
2896         if (kvm_is_error_hva(addr))
2897                 return -EFAULT;
2898         r = __copy_to_user((void __user *)addr + offset, data, len);
2899         if (r)
2900                 return -EFAULT;
2901         mark_page_dirty_in_slot(kvm, memslot, gfn);
2902         return 0;
2903 }
2904
2905 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
2906                          const void *data, int offset, int len)
2907 {
2908         struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2909
2910         return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len);
2911 }
2912 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
2913
2914 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
2915                               const void *data, int offset, int len)
2916 {
2917         struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2918
2919         return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len);
2920 }
2921 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
2922
2923 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
2924                     unsigned long len)
2925 {
2926         gfn_t gfn = gpa >> PAGE_SHIFT;
2927         int seg;
2928         int offset = offset_in_page(gpa);
2929         int ret;
2930
2931         while ((seg = next_segment(len, offset)) != 0) {
2932                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
2933                 if (ret < 0)
2934                         return ret;
2935                 offset = 0;
2936                 len -= seg;
2937                 data += seg;
2938                 ++gfn;
2939         }
2940         return 0;
2941 }
2942 EXPORT_SYMBOL_GPL(kvm_write_guest);
2943
2944 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
2945                          unsigned long len)
2946 {
2947         gfn_t gfn = gpa >> PAGE_SHIFT;
2948         int seg;
2949         int offset = offset_in_page(gpa);
2950         int ret;
2951
2952         while ((seg = next_segment(len, offset)) != 0) {
2953                 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
2954                 if (ret < 0)
2955                         return ret;
2956                 offset = 0;
2957                 len -= seg;
2958                 data += seg;
2959                 ++gfn;
2960         }
2961         return 0;
2962 }
2963 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
2964
2965 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
2966                                        struct gfn_to_hva_cache *ghc,
2967                                        gpa_t gpa, unsigned long len)
2968 {
2969         int offset = offset_in_page(gpa);
2970         gfn_t start_gfn = gpa >> PAGE_SHIFT;
2971         gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
2972         gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
2973         gfn_t nr_pages_avail;
2974
2975         /* Update ghc->generation before performing any error checks. */
2976         ghc->generation = slots->generation;
2977
2978         if (start_gfn > end_gfn) {
2979                 ghc->hva = KVM_HVA_ERR_BAD;
2980                 return -EINVAL;
2981         }
2982
2983         /*
2984          * If the requested region crosses two memslots, we still
2985          * verify that the entire region is valid here.
2986          */
2987         for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
2988                 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
2989                 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
2990                                            &nr_pages_avail);
2991                 if (kvm_is_error_hva(ghc->hva))
2992                         return -EFAULT;
2993         }
2994
2995         /* Use the slow path for cross page reads and writes. */
2996         if (nr_pages_needed == 1)
2997                 ghc->hva += offset;
2998         else
2999                 ghc->memslot = NULL;
3000
3001         ghc->gpa = gpa;
3002         ghc->len = len;
3003         return 0;
3004 }
3005
3006 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3007                               gpa_t gpa, unsigned long len)
3008 {
3009         struct kvm_memslots *slots = kvm_memslots(kvm);
3010         return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3011 }
3012 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3013
3014 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3015                                   void *data, unsigned int offset,
3016                                   unsigned long len)
3017 {
3018         struct kvm_memslots *slots = kvm_memslots(kvm);
3019         int r;
3020         gpa_t gpa = ghc->gpa + offset;
3021
3022         if (WARN_ON_ONCE(len + offset > ghc->len))
3023                 return -EINVAL;
3024
3025         if (slots->generation != ghc->generation) {
3026                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3027                         return -EFAULT;
3028         }
3029
3030         if (kvm_is_error_hva(ghc->hva))
3031                 return -EFAULT;
3032
3033         if (unlikely(!ghc->memslot))
3034                 return kvm_write_guest(kvm, gpa, data, len);
3035
3036         r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
3037         if (r)
3038                 return -EFAULT;
3039         mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
3040
3041         return 0;
3042 }
3043 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3044
/* Write @len bytes from @data at the start of the range cached in @ghc. */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	const unsigned int offset = 0;

	return kvm_write_guest_offset_cached(kvm, ghc, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3051
3052 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
3053                                  void *data, unsigned int offset,
3054                                  unsigned long len)
3055 {
3056         struct kvm_memslots *slots = kvm_memslots(kvm);
3057         int r;
3058         gpa_t gpa = ghc->gpa + offset;
3059
3060         if (WARN_ON_ONCE(len + offset > ghc->len))
3061                 return -EINVAL;
3062
3063         if (slots->generation != ghc->generation) {
3064                 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
3065                         return -EFAULT;
3066         }
3067
3068         if (kvm_is_error_hva(ghc->hva))
3069                 return -EFAULT;
3070
3071         if (unlikely(!ghc->memslot))
3072                 return kvm_read_guest(kvm, gpa, data, len);
3073
3074         r = __copy_from_user(data, (void __user *)ghc->hva + offset, len);
3075         if (r)
3076                 return -EFAULT;
3077
3078         return 0;
3079 }
3080 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3081
/* Read @len bytes from the start of the range cached in @ghc into @data. */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			  void *data, unsigned long len)
{
	const unsigned int offset = 0;

	return kvm_read_guest_offset_cached(kvm, ghc, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3088
3089 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
3090 {
3091         const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
3092         gfn_t gfn = gpa >> PAGE_SHIFT;
3093         int seg;
3094         int offset = offset_in_page(gpa);
3095         int ret;
3096
3097         while ((seg = next_segment(len, offset)) != 0) {
3098                 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3099                 if (ret < 0)
3100                         return ret;
3101                 offset = 0;
3102                 len -= seg;
3103                 ++gfn;
3104         }
3105         return 0;
3106 }
3107 EXPORT_SYMBOL_GPL(kvm_clear_guest);
3108
3109 void mark_page_dirty_in_slot(struct kvm *kvm,
3110                              struct kvm_memory_slot *memslot,
3111                              gfn_t gfn)
3112 {
3113         if (memslot && kvm_slot_dirty_track_enabled(memslot)) {
3114                 unsigned long rel_gfn = gfn - memslot->base_gfn;
3115                 u32 slot = (memslot->as_id << 16) | memslot->id;
3116
3117                 if (kvm->dirty_ring_size)
3118                         kvm_dirty_ring_push(kvm_dirty_ring_get(kvm),
3119                                             slot, rel_gfn);
3120                 else
3121                         set_bit_le(rel_gfn, memslot->dirty_bitmap);
3122         }
3123 }
3124 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3125
3126 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3127 {
3128         struct kvm_memory_slot *memslot;
3129
3130         memslot = gfn_to_memslot(kvm, gfn);
3131         mark_page_dirty_in_slot(kvm, memslot, gfn);
3132 }
3133 EXPORT_SYMBOL_GPL(mark_page_dirty);
3134
3135 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3136 {
3137         struct kvm_memory_slot *memslot;
3138
3139         memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3140         mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3141 }
3142 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3143
3144 void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3145 {
3146         if (!vcpu->sigset_active)
3147                 return;
3148
3149         /*
3150          * This does a lockless modification of ->real_blocked, which is fine
3151          * because, only current can change ->real_blocked and all readers of
3152          * ->real_blocked don't care as long ->real_blocked is always a subset
3153          * of ->blocked.
3154          */
3155         sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3156 }
3157
3158 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3159 {
3160         if (!vcpu->sigset_active)
3161                 return;
3162
3163         sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3164         sigemptyset(&current->real_blocked);
3165 }
3166
3167 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3168 {
3169         unsigned int old, val, grow, grow_start;
3170
3171         old = val = vcpu->halt_poll_ns;
3172         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3173         grow = READ_ONCE(halt_poll_ns_grow);
3174         if (!grow)
3175                 goto out;
3176
3177         val *= grow;
3178         if (val < grow_start)
3179                 val = grow_start;
3180
3181         if (val > vcpu->kvm->max_halt_poll_ns)
3182                 val = vcpu->kvm->max_halt_poll_ns;
3183
3184         vcpu->halt_poll_ns = val;
3185 out:
3186         trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3187 }
3188
3189 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3190 {
3191         unsigned int old, val, shrink, grow_start;
3192
3193         old = val = vcpu->halt_poll_ns;
3194         shrink = READ_ONCE(halt_poll_ns_shrink);
3195         grow_start = READ_ONCE(halt_poll_ns_grow_start);
3196         if (shrink == 0)
3197                 val = 0;
3198         else
3199                 val /= shrink;
3200
3201         if (val < grow_start)
3202                 val = 0;
3203
3204         vcpu->halt_poll_ns = val;
3205         trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3206 }
3207
3208 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3209 {
3210         int ret = -EINTR;
3211         int idx = srcu_read_lock(&vcpu->kvm->srcu);
3212
3213         if (kvm_arch_vcpu_runnable(vcpu)) {
3214                 kvm_make_request(KVM_REQ_UNHALT, vcpu);
3215                 goto out;
3216         }
3217         if (kvm_cpu_has_pending_timer(vcpu))
3218                 goto out;
3219         if (signal_pending(current))
3220                 goto out;
3221         if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3222                 goto out;
3223
3224         ret = 0;
3225 out:
3226         srcu_read_unlock(&vcpu->kvm->srcu, idx);
3227         return ret;
3228 }
3229
3230 static inline void
3231 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited)
3232 {
3233         if (waited)
3234                 vcpu->stat.generic.halt_poll_fail_ns += poll_ns;
3235         else
3236                 vcpu->stat.generic.halt_poll_success_ns += poll_ns;
3237 }
3238
3239 /*
3240  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
3241  */
3242 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
3243 {
3244         bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3245         ktime_t start, cur, poll_end;
3246         bool waited = false;
3247         u64 block_ns;
3248
3249         kvm_arch_vcpu_blocking(vcpu);
3250
3251         start = cur = poll_end = ktime_get();
3252         if (vcpu->halt_poll_ns && halt_poll_allowed) {
3253                 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
3254
3255                 ++vcpu->stat.generic.halt_attempted_poll;
3256                 do {
3257                         /*
3258                          * This sets KVM_REQ_UNHALT if an interrupt
3259                          * arrives.
3260                          */
3261                         if (kvm_vcpu_check_block(vcpu) < 0) {
3262                                 ++vcpu->stat.generic.halt_successful_poll;
3263                                 if (!vcpu_valid_wakeup(vcpu))
3264                                         ++vcpu->stat.generic.halt_poll_invalid;
3265
3266                                 KVM_STATS_LOG_HIST_UPDATE(
3267                                       vcpu->stat.generic.halt_poll_success_hist,
3268                                       ktime_to_ns(ktime_get()) -
3269                                       ktime_to_ns(start));
3270                                 goto out;
3271                         }
3272                         cpu_relax();
3273                         poll_end = cur = ktime_get();
3274                 } while (kvm_vcpu_can_poll(cur, stop));
3275
3276                 KVM_STATS_LOG_HIST_UPDATE(
3277                                 vcpu->stat.generic.halt_poll_fail_hist,
3278                                 ktime_to_ns(ktime_get()) - ktime_to_ns(start));
3279         }
3280
3281
3282         prepare_to_rcuwait(&vcpu->wait);
3283         for (;;) {
3284                 set_current_state(TASK_INTERRUPTIBLE);
3285
3286                 if (kvm_vcpu_check_block(vcpu) < 0)
3287                         break;
3288
3289                 waited = true;
3290                 schedule();
3291         }
3292         finish_rcuwait(&vcpu->wait);
3293         cur = ktime_get();
3294         if (waited) {
3295                 vcpu->stat.generic.halt_wait_ns +=
3296                         ktime_to_ns(cur) - ktime_to_ns(poll_end);
3297                 KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3298                                 ktime_to_ns(cur) - ktime_to_ns(poll_end));
3299         }
3300 out:
3301         kvm_arch_vcpu_unblocking(vcpu);
3302         block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
3303
3304         update_halt_poll_stats(
3305                 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited);
3306
3307         if (halt_poll_allowed) {
3308                 if (!vcpu_valid_wakeup(vcpu)) {
3309                         shrink_halt_poll_ns(vcpu);
3310                 } else if (vcpu->kvm->max_halt_poll_ns) {
3311                         if (block_ns <= vcpu->halt_poll_ns)
3312                                 ;
3313                         /* we had a long block, shrink polling */
3314                         else if (vcpu->halt_poll_ns &&
3315                                         block_ns > vcpu->kvm->max_halt_poll_ns)
3316                                 shrink_halt_poll_ns(vcpu);
3317                         /* we had a short halt and our poll time is too small */
3318                         else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns &&
3319                                         block_ns < vcpu->kvm->max_halt_poll_ns)
3320                                 grow_halt_poll_ns(vcpu);
3321                 } else {
3322                         vcpu->halt_poll_ns = 0;
3323                 }
3324         }
3325
3326         trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
3327         kvm_arch_vcpu_block_finish(vcpu);
3328 }
3329 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
3330
3331 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3332 {
3333         struct rcuwait *waitp;
3334
3335         waitp = kvm_arch_vcpu_get_wait(vcpu);
3336         if (rcuwait_wake_up(waitp)) {
3337                 WRITE_ONCE(vcpu->ready, true);
3338                 ++vcpu->stat.generic.halt_wakeup;
3339                 return true;
3340         }
3341
3342         return false;
3343 }
3344 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3345
3346 #ifndef CONFIG_S390
3347 /*
3348  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3349  */
3350 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3351 {
3352         int me, cpu;
3353
3354         if (kvm_vcpu_wake_up(vcpu))
3355                 return;
3356
3357         /*
3358          * Note, the vCPU could get migrated to a different pCPU at any point
3359          * after kvm_arch_vcpu_should_kick(), which could result in sending an
3360          * IPI to the previous pCPU.  But, that's ok because the purpose of the
3361          * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3362          * vCPU also requires it to leave IN_GUEST_MODE.
3363          */
3364         me = get_cpu();
3365         if (kvm_arch_vcpu_should_kick(vcpu)) {
3366                 cpu = READ_ONCE(vcpu->cpu);
3367                 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3368                         smp_send_reschedule(cpu);
3369         }
3370         put_cpu();
3371 }
3372 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3373 #endif /* !CONFIG_S390 */
3374
3375 int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3376 {
3377         struct pid *pid;
3378         struct task_struct *task = NULL;
3379         int ret = 0;
3380
3381         rcu_read_lock();
3382         pid = rcu_dereference(target->pid);
3383         if (pid)
3384                 task = get_pid_task(pid, PIDTYPE_PID);
3385         rcu_read_unlock();
3386         if (!task)
3387                 return ret;
3388         ret = yield_to(task, 1);
3389         put_task_struct(task);
3390
3391         return ret;
3392 }
3393 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3394
3395 /*
3396  * Helper that checks whether a VCPU is eligible for directed yield.
3397  * Most eligible candidate to yield is decided by following heuristics:
3398  *
3399  *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3400  *  (preempted lock holder), indicated by @in_spin_loop.
3401  *  Set at the beginning and cleared at the end of interception/PLE handler.
3402  *
3403  *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3404  *  chance last time (mostly it has become eligible now since we have probably
3405  *  yielded to lockholder in last iteration. This is done by toggling
3406  *  @dy_eligible each time a VCPU checked for eligibility.)
3407  *
3408  *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3409  *  to preempted lock-holder could result in wrong VCPU selection and CPU
3410  *  burning. Giving priority for a potential lock-holder increases lock
3411  *  progress.
3412  *
 *  Since the algorithm is heuristic, accessing another VCPU's data without
 *  locking does no harm. At worst it results in trying to yield to the same
 *  VCPU, failing, and continuing with the next VCPU, and so on.
3416  */
3417 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3418 {
3419 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3420         bool eligible;
3421
3422         eligible = !vcpu->spin_loop.in_spin_loop ||
3423                     vcpu->spin_loop.dy_eligible;
3424
3425         if (vcpu->spin_loop.in_spin_loop)
3426                 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
3427
3428         return eligible;
3429 #else
3430         return true;
3431 #endif
3432 }
3433
3434 /*
3435  * Unlike kvm_arch_vcpu_runnable, this function is called outside
3436  * a vcpu_load/vcpu_put pair.  However, for most architectures
3437  * kvm_arch_vcpu_runnable does not require vcpu_load.
3438  */
3439 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3440 {
3441         return kvm_arch_vcpu_runnable(vcpu);
3442 }
3443
3444 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3445 {
3446         if (kvm_arch_dy_runnable(vcpu))
3447                 return true;
3448
3449 #ifdef CONFIG_KVM_ASYNC_PF
3450         if (!list_empty_careful(&vcpu->async_pf.done))
3451                 return true;
3452 #endif
3453
3454         return false;
3455 }
3456
3457 bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3458 {
3459         return false;
3460 }
3461
3462 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3463 {
3464         struct kvm *kvm = me->kvm;
3465         struct kvm_vcpu *vcpu;
3466         int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3467         int yielded = 0;
3468         int try = 3;
3469         int pass;
3470         int i;
3471
3472         kvm_vcpu_set_in_spin_loop(me, true);
3473         /*
3474          * We boost the priority of a VCPU that is runnable but not
3475          * currently running, because it got preempted by something
3476          * else and called schedule in __vcpu_run.  Hopefully that
3477          * VCPU is holding the lock that we need and will release it.
3478          * We approximate round-robin by starting at the last boosted VCPU.
3479          */
3480         for (pass = 0; pass < 2 && !yielded && try; pass++) {
3481                 kvm_for_each_vcpu(i, vcpu, kvm) {
3482                         if (!pass && i <= last_boosted_vcpu) {
3483                                 i = last_boosted_vcpu;
3484                                 continue;
3485                         } else if (pass && i > last_boosted_vcpu)
3486                                 break;
3487                         if (!READ_ONCE(vcpu->ready))
3488                                 continue;
3489                         if (vcpu == me)
3490                                 continue;
3491                         if (rcuwait_active(&vcpu->wait) &&
3492                             !vcpu_dy_runnable(vcpu))
3493                                 continue;
3494                         if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3495                             !kvm_arch_dy_has_pending_interrupt(vcpu) &&
3496                             !kvm_arch_vcpu_in_kernel(vcpu))
3497                                 continue;
3498                         if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3499                                 continue;
3500
3501                         yielded = kvm_vcpu_yield_to(vcpu);
3502                         if (yielded > 0) {
3503                                 kvm->last_boosted_vcpu = i;
3504                                 break;
3505                         } else if (yielded < 0) {
3506                                 try--;
3507                                 if (!try)
3508                                         break;
3509                         }
3510                 }
3511         }
3512         kvm_vcpu_set_in_spin_loop(me, false);
3513
3514         /* Ensure vcpu is not eligible during next spinloop */
3515         kvm_vcpu_set_dy_eligible(me, false);
3516 }
3517 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3518
3519 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
3520 {
3521 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0
3522         return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3523             (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3524              kvm->dirty_ring_size / PAGE_SIZE);
3525 #else
3526         return false;
3527 #endif
3528 }
3529
3530 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3531 {
3532         struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3533         struct page *page;
3534
3535         if (vmf->pgoff == 0)
3536                 page = virt_to_page(vcpu->run);
3537 #ifdef CONFIG_X86
3538         else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3539                 page = virt_to_page(vcpu->arch.pio_data);
3540 #endif
3541 #ifdef CONFIG_KVM_MMIO
3542         else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3543                 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3544 #endif
3545         else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
3546                 page = kvm_dirty_ring_get_page(
3547                     &vcpu->dirty_ring,
3548                     vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3549         else
3550                 return kvm_arch_vcpu_fault(vcpu, vmf);
3551         get_page(page);
3552         vmf->page = page;
3553         return 0;
3554 }
3555
3556 static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3557         .fault = kvm_vcpu_fault,
3558 };
3559
3560 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
3561 {
3562         struct kvm_vcpu *vcpu = file->private_data;
3563         unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
3564
3565         if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
3566              kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) &&
3567             ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED)))
3568                 return -EINVAL;
3569
3570         vma->vm_ops = &kvm_vcpu_vm_ops;
3571         return 0;
3572 }
3573
3574 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
3575 {
3576         struct kvm_vcpu *vcpu = filp->private_data;
3577
3578         kvm_put_kvm(vcpu->kvm);
3579         return 0;
3580 }
3581
3582 static struct file_operations kvm_vcpu_fops = {
3583         .release        = kvm_vcpu_release,
3584         .unlocked_ioctl = kvm_vcpu_ioctl,
3585         .mmap           = kvm_vcpu_mmap,
3586         .llseek         = noop_llseek,
3587         KVM_COMPAT(kvm_vcpu_compat_ioctl),
3588 };
3589
3590 /*
3591  * Allocates an inode for the vcpu.
3592  */
3593 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3594 {
3595         char name[8 + 1 + ITOA_MAX_LEN + 1];
3596
3597         snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);
3598         return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);
3599 }
3600
/*
 * Create the "vcpu<id>" debugfs directory under the VM's debugfs directory
 * and let the architecture populate it.  Compiles to nothing unless the
 * architecture defines __KVM_HAVE_ARCH_VCPU_DEBUGFS.
 */
static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
{
#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
        char dir_name[ITOA_MAX_LEN * 2];
        struct dentry *vcpu_dir;

        if (debugfs_initialized()) {
                snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);
                vcpu_dir = debugfs_create_dir(dir_name,
                                              vcpu->kvm->debugfs_dentry);

                kvm_arch_create_vcpu_debugfs(vcpu, vcpu_dir);
        }
#endif
}
3617
3618 /*
3619  * Creates some virtual cpus.  Good luck creating more than one.
3620  */
3621 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3622 {
3623         int r;
3624         struct kvm_vcpu *vcpu;
3625         struct page *page;
3626
3627         if (id >= KVM_MAX_VCPU_ID)
3628                 return -EINVAL;
3629
3630         mutex_lock(&kvm->lock);
3631         if (kvm->created_vcpus == KVM_MAX_VCPUS) {
3632                 mutex_unlock(&kvm->lock);
3633                 return -EINVAL;
3634         }
3635
3636         kvm->created_vcpus++;
3637         mutex_unlock(&kvm->lock);
3638
3639         r = kvm_arch_vcpu_precreate(kvm, id);
3640         if (r)
3641                 goto vcpu_decrement;
3642
3643         vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3644         if (!vcpu) {
3645                 r = -ENOMEM;
3646                 goto vcpu_decrement;
3647         }
3648
3649         BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3650         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
3651         if (!page) {
3652                 r = -ENOMEM;
3653                 goto vcpu_free;
3654         }
3655         vcpu->run = page_address(page);
3656
3657         kvm_vcpu_init(vcpu, kvm, id);
3658
3659         r = kvm_arch_vcpu_create(vcpu);
3660         if (r)
3661                 goto vcpu_free_run_page;
3662
3663         if (kvm->dirty_ring_size) {
3664                 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
3665                                          id, kvm->dirty_ring_size);
3666                 if (r)
3667                         goto arch_vcpu_destroy;
3668         }
3669
3670         mutex_lock(&kvm->lock);
3671         if (kvm_get_vcpu_by_id(kvm, id)) {
3672                 r = -EEXIST;
3673                 goto unlock_vcpu_destroy;
3674         }
3675
3676         vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
3677         BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
3678
3679         /* Fill the stats id string for the vcpu */
3680         snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
3681                  task_pid_nr(current), id);
3682
3683         /* Now it's all set up, let userspace reach it */
3684         kvm_get_kvm(kvm);
3685         r = create_vcpu_fd(vcpu);
3686         if (r < 0) {
3687                 kvm_put_kvm_no_destroy(kvm);
3688                 goto unlock_vcpu_destroy;
3689         }
3690
3691         kvm->vcpus[vcpu->vcpu_idx] = vcpu;
3692
3693         /*
3694          * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
3695          * before kvm->online_vcpu's incremented value.
3696          */
3697         smp_wmb();
3698         atomic_inc(&kvm->online_vcpus);
3699
3700         mutex_unlock(&kvm->lock);
3701         kvm_arch_vcpu_postcreate(vcpu);
3702         kvm_create_vcpu_debugfs(vcpu);
3703         return r;
3704
3705 unlock_vcpu_destroy:
3706         mutex_unlock(&kvm->lock);
3707         kvm_dirty_ring_free(&vcpu->dirty_ring);
3708 arch_vcpu_destroy:
3709         kvm_arch_vcpu_destroy(vcpu);
3710 vcpu_free_run_page:
3711         free_page((unsigned long)vcpu->run);
3712 vcpu_free:
3713         kmem_cache_free(kvm_vcpu_cache, vcpu);
3714 vcpu_decrement:
3715         mutex_lock(&kvm->lock);
3716         kvm->created_vcpus--;
3717         mutex_unlock(&kvm->lock);
3718         return r;
3719 }
3720
3721 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
3722 {
3723         if (sigset) {
3724                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
3725                 vcpu->sigset_active = 1;
3726                 vcpu->sigset = *sigset;
3727         } else
3728                 vcpu->sigset_active = 0;
3729         return 0;
3730 }
3731
3732 static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
3733                               size_t size, loff_t *offset)
3734 {
3735         struct kvm_vcpu *vcpu = file->private_data;
3736
3737         return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
3738                         &kvm_vcpu_stats_desc[0], &vcpu->stat,
3739                         sizeof(vcpu->stat), user_buffer, size, offset);
3740 }
3741
3742 static const struct file_operations kvm_vcpu_stats_fops = {
3743         .read = kvm_vcpu_stats_read,
3744         .llseek = noop_llseek,
3745 };
3746
3747 static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
3748 {
3749         int fd;
3750         struct file *file;
3751         char name[15 + ITOA_MAX_LEN + 1];
3752
3753         snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);
3754
3755         fd = get_unused_fd_flags(O_CLOEXEC);
3756         if (fd < 0)
3757                 return fd;
3758
3759         file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
3760         if (IS_ERR(file)) {
3761                 put_unused_fd(fd);
3762                 return PTR_ERR(file);
3763         }
3764         file->f_mode |= FMODE_PREAD;
3765         fd_install(fd, file);
3766
3767         return fd;
3768 }
3769
3770 static long kvm_vcpu_ioctl(struct file *filp,
3771                            unsigned int ioctl, unsigned long arg)
3772 {
3773         struct kvm_vcpu *vcpu = filp->private_data;
3774         void __user *argp = (void __user *)arg;
3775         int r;
3776         struct kvm_fpu *fpu = NULL;
3777         struct kvm_sregs *kvm_sregs = NULL;
3778
3779         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3780                 return -EIO;
3781
3782         if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
3783                 return -EINVAL;
3784
3785         /*
3786          * Some architectures have vcpu ioctls that are asynchronous to vcpu
3787          * execution; mutex_lock() would break them.
3788          */
3789         r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
3790         if (r != -ENOIOCTLCMD)
3791                 return r;
3792
3793         if (mutex_lock_killable(&vcpu->mutex))
3794                 return -EINTR;
3795         switch (ioctl) {
3796         case KVM_RUN: {
3797                 struct pid *oldpid;
3798                 r = -EINVAL;
3799                 if (arg)
3800                         goto out;
3801                 oldpid = rcu_access_pointer(vcpu->pid);
3802                 if (unlikely(oldpid != task_pid(current))) {
3803                         /* The thread running this VCPU changed. */
3804                         struct pid *newpid;
3805
3806                         r = kvm_arch_vcpu_run_pid_change(vcpu);
3807                         if (r)
3808                                 break;
3809
3810                         newpid = get_task_pid(current, PIDTYPE_PID);
3811                         rcu_assign_pointer(vcpu->pid, newpid);
3812                         if (oldpid)
3813                                 synchronize_rcu();
3814                         put_pid(oldpid);
3815                 }
3816                 r = kvm_arch_vcpu_ioctl_run(vcpu);
3817                 trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
3818                 break;
3819         }
3820         case KVM_GET_REGS: {
3821                 struct kvm_regs *kvm_regs;
3822
3823                 r = -ENOMEM;
3824                 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
3825                 if (!kvm_regs)
3826                         goto out;
3827                 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
3828                 if (r)
3829                         goto out_free1;
3830                 r = -EFAULT;
3831                 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
3832                         goto out_free1;
3833                 r = 0;
3834 out_free1:
3835                 kfree(kvm_regs);
3836                 break;
3837         }
3838         case KVM_SET_REGS: {
3839                 struct kvm_regs *kvm_regs;
3840
3841                 kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
3842                 if (IS_ERR(kvm_regs)) {
3843                         r = PTR_ERR(kvm_regs);
3844                         goto out;
3845                 }
3846                 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
3847                 kfree(kvm_regs);
3848                 break;
3849         }
3850         case KVM_GET_SREGS: {
3851                 kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
3852                                     GFP_KERNEL_ACCOUNT);
3853                 r = -ENOMEM;
3854                 if (!kvm_sregs)
3855                         goto out;
3856                 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
3857                 if (r)
3858                         goto out;
3859                 r = -EFAULT;
3860                 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
3861                         goto out;
3862                 r = 0;
3863                 break;
3864         }
3865         case KVM_SET_SREGS: {
3866                 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
3867                 if (IS_ERR(kvm_sregs)) {
3868                         r = PTR_ERR(kvm_sregs);
3869                         kvm_sregs = NULL;
3870                         goto out;
3871                 }
3872                 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
3873                 break;
3874         }
3875         case KVM_GET_MP_STATE: {
3876                 struct kvm_mp_state mp_state;
3877
3878                 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
3879                 if (r)
3880                         goto out;
3881                 r = -EFAULT;
3882                 if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
3883                         goto out;
3884                 r = 0;
3885                 break;
3886         }
3887         case KVM_SET_MP_STATE: {
3888                 struct kvm_mp_state mp_state;
3889
3890                 r = -EFAULT;
3891                 if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
3892                         goto out;
3893                 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
3894                 break;
3895         }
3896         case KVM_TRANSLATE: {
3897                 struct kvm_translation tr;
3898
3899                 r = -EFAULT;
3900                 if (copy_from_user(&tr, argp, sizeof(tr)))
3901                         goto out;
3902                 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
3903                 if (r)
3904                         goto out;
3905                 r = -EFAULT;
3906                 if (copy_to_user(argp, &tr, sizeof(tr)))
3907                         goto out;
3908                 r = 0;
3909                 break;
3910         }
3911         case KVM_SET_GUEST_DEBUG: {
3912                 struct kvm_guest_debug dbg;
3913
3914                 r = -EFAULT;
3915                 if (copy_from_user(&dbg, argp, sizeof(dbg)))
3916                         goto out;
3917                 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
3918                 break;
3919         }
3920         case KVM_SET_SIGNAL_MASK: {
3921                 struct kvm_signal_mask __user *sigmask_arg = argp;
3922                 struct kvm_signal_mask kvm_sigmask;
3923                 sigset_t sigset, *p;
3924
3925                 p = NULL;
3926                 if (argp) {
3927                         r = -EFAULT;
3928                         if (copy_from_user(&kvm_sigmask, argp,
3929                                            sizeof(kvm_sigmask)))
3930                                 goto out;
3931                         r = -EINVAL;
3932                         if (kvm_sigmask.len != sizeof(sigset))
3933                                 goto out;
3934                         r = -EFAULT;
3935                         if (copy_from_user(&sigset, sigmask_arg->sigset,
3936                                            sizeof(sigset)))
3937                                 goto out;
3938                         p = &sigset;
3939                 }
3940                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
3941                 break;
3942         }
3943         case KVM_GET_FPU: {
3944                 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
3945                 r = -ENOMEM;
3946                 if (!fpu)
3947                         goto out;
3948                 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
3949                 if (r)
3950                         goto out;
3951                 r = -EFAULT;
3952                 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
3953                         goto out;
3954                 r = 0;
3955                 break;
3956         }
3957         case KVM_SET_FPU: {
3958                 fpu = memdup_user(argp, sizeof(*fpu));
3959                 if (IS_ERR(fpu)) {
3960                         r = PTR_ERR(fpu);
3961                         fpu = NULL;
3962                         goto out;
3963                 }
3964                 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
3965                 break;
3966         }
3967         case KVM_GET_STATS_FD: {
3968                 r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
3969                 break;
3970         }
3971         default:
3972                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
3973         }
3974 out:
3975         mutex_unlock(&vcpu->mutex);
3976         kfree(fpu);
3977         kfree(kvm_sregs);
3978         return r;
3979 }
3980
3981 #ifdef CONFIG_KVM_COMPAT
3982 static long kvm_vcpu_compat_ioctl(struct file *filp,
3983                                   unsigned int ioctl, unsigned long arg)
3984 {
3985         struct kvm_vcpu *vcpu = filp->private_data;
3986         void __user *argp = compat_ptr(arg);
3987         int r;
3988
3989         if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
3990                 return -EIO;
3991
3992         switch (ioctl) {
3993         case KVM_SET_SIGNAL_MASK: {
3994                 struct kvm_signal_mask __user *sigmask_arg = argp;
3995                 struct kvm_signal_mask kvm_sigmask;
3996                 sigset_t sigset;
3997
3998                 if (argp) {
3999                         r = -EFAULT;
4000                         if (copy_from_user(&kvm_sigmask, argp,
4001                                            sizeof(kvm_sigmask)))
4002                                 goto out;
4003                         r = -EINVAL;
4004                         if (kvm_sigmask.len != sizeof(compat_sigset_t))
4005                                 goto out;
4006                         r = -EFAULT;
4007                         if (get_compat_sigset(&sigset,
4008                                               (compat_sigset_t __user *)sigmask_arg->sigset))
4009                                 goto out;
4010                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
4011                 } else
4012                         r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4013                 break;
4014         }
4015         default:
4016                 r = kvm_vcpu_ioctl(filp, ioctl, arg);
4017         }
4018
4019 out:
4020         return r;
4021 }
4022 #endif
4023
4024 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
4025 {
4026         struct kvm_device *dev = filp->private_data;
4027
4028         if (dev->ops->mmap)
4029                 return dev->ops->mmap(dev, vma);
4030
4031         return -ENODEV;
4032 }
4033
4034 static int kvm_device_ioctl_attr(struct kvm_device *dev,
4035                                  int (*accessor)(struct kvm_device *dev,
4036                                                  struct kvm_device_attr *attr),
4037                                  unsigned long arg)
4038 {
4039         struct kvm_device_attr attr;
4040
4041         if (!accessor)
4042                 return -EPERM;
4043
4044         if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4045                 return -EFAULT;
4046
4047         return accessor(dev, &attr);
4048 }
4049
4050 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
4051                              unsigned long arg)
4052 {
4053         struct kvm_device *dev = filp->private_data;
4054
4055         if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged)
4056                 return -EIO;
4057
4058         switch (ioctl) {
4059         case KVM_SET_DEVICE_ATTR:
4060                 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
4061         case KVM_GET_DEVICE_ATTR:
4062                 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
4063         case KVM_HAS_DEVICE_ATTR:
4064                 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
4065         default:
4066                 if (dev->ops->ioctl)
4067                         return dev->ops->ioctl(dev, ioctl, arg);
4068
4069                 return -ENOTTY;
4070         }
4071 }
4072
4073 static int kvm_device_release(struct inode *inode, struct file *filp)
4074 {
4075         struct kvm_device *dev = filp->private_data;
4076         struct kvm *kvm = dev->kvm;
4077
4078         if (dev->ops->release) {
4079                 mutex_lock(&kvm->lock);
4080                 list_del(&dev->vm_node);
4081                 dev->ops->release(dev);
4082                 mutex_unlock(&kvm->lock);
4083         }
4084
4085         kvm_put_kvm(kvm);
4086         return 0;
4087 }
4088
4089 static const struct file_operations kvm_device_fops = {
4090         .unlocked_ioctl = kvm_device_ioctl,
4091         .release = kvm_device_release,
4092         KVM_COMPAT(kvm_device_ioctl),
4093         .mmap = kvm_device_mmap,
4094 };
4095
4096 struct kvm_device *kvm_device_from_filp(struct file *filp)
4097 {
4098         if (filp->f_op != &kvm_device_fops)
4099                 return NULL;
4100
4101         return filp->private_data;
4102 }
4103
4104 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4105 #ifdef CONFIG_KVM_MPIC
4106         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
4107         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
4108 #endif
4109 };
4110
4111 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4112 {
4113         if (type >= ARRAY_SIZE(kvm_device_ops_table))
4114                 return -ENOSPC;
4115
4116         if (kvm_device_ops_table[type] != NULL)
4117                 return -EEXIST;
4118
4119         kvm_device_ops_table[type] = ops;
4120         return 0;
4121 }
4122
4123 void kvm_unregister_device_ops(u32 type)
4124 {
4125         if (kvm_device_ops_table[type] != NULL)
4126                 kvm_device_ops_table[type] = NULL;
4127 }
4128
4129 static int kvm_ioctl_create_device(struct kvm *kvm,
4130                                    struct kvm_create_device *cd)
4131 {
4132         const struct kvm_device_ops *ops = NULL;
4133         struct kvm_device *dev;
4134         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4135         int type;
4136         int ret;
4137
4138         if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4139                 return -ENODEV;
4140
4141         type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4142         ops = kvm_device_ops_table[type];
4143         if (ops == NULL)
4144                 return -ENODEV;
4145
4146         if (test)
4147                 return 0;
4148
4149         dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
4150         if (!dev)
4151                 return -ENOMEM;
4152
4153         dev->ops = ops;
4154         dev->kvm = kvm;
4155
4156         mutex_lock(&kvm->lock);
4157         ret = ops->create(dev, type);
4158         if (ret < 0) {
4159                 mutex_unlock(&kvm->lock);
4160                 kfree(dev);
4161                 return ret;
4162         }
4163         list_add(&dev->vm_node, &kvm->devices);
4164         mutex_unlock(&kvm->lock);
4165
4166         if (ops->init)
4167                 ops->init(dev);
4168
4169         kvm_get_kvm(kvm);
4170         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
4171         if (ret < 0) {
4172                 kvm_put_kvm_no_destroy(kvm);
4173                 mutex_lock(&kvm->lock);
4174                 list_del(&dev->vm_node);
4175                 mutex_unlock(&kvm->lock);
4176                 ops->destroy(dev);
4177                 return ret;
4178         }
4179
4180         cd->fd = ret;
4181         return 0;
4182 }
4183
4184 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
4185 {
4186         switch (arg) {
4187         case KVM_CAP_USER_MEMORY:
4188         case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4189         case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4190         case KVM_CAP_INTERNAL_ERROR_DATA:
4191 #ifdef CONFIG_HAVE_KVM_MSI
4192         case KVM_CAP_SIGNAL_MSI:
4193 #endif
4194 #ifdef CONFIG_HAVE_KVM_IRQFD
4195         case KVM_CAP_IRQFD:
4196         case KVM_CAP_IRQFD_RESAMPLE:
4197 #endif
4198         case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4199         case KVM_CAP_CHECK_EXTENSION_VM:
4200         case KVM_CAP_ENABLE_CAP_VM:
4201         case KVM_CAP_HALT_POLL:
4202                 return 1;
4203 #ifdef CONFIG_KVM_MMIO
4204         case KVM_CAP_COALESCED_MMIO:
4205                 return KVM_COALESCED_MMIO_PAGE_OFFSET;
4206         case KVM_CAP_COALESCED_PIO:
4207                 return 1;
4208 #endif
4209 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4210         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4211                 return KVM_DIRTY_LOG_MANUAL_CAPS;
4212 #endif
4213 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4214         case KVM_CAP_IRQ_ROUTING:
4215                 return KVM_MAX_IRQ_ROUTES;
4216 #endif
4217 #if KVM_ADDRESS_SPACE_NUM > 1
4218         case KVM_CAP_MULTI_ADDRESS_SPACE:
4219                 return KVM_ADDRESS_SPACE_NUM;
4220 #endif
4221         case KVM_CAP_NR_MEMSLOTS:
4222                 return KVM_USER_MEM_SLOTS;
4223         case KVM_CAP_DIRTY_LOG_RING:
4224 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0
4225                 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4226 #else
4227                 return 0;
4228 #endif
4229         case KVM_CAP_BINARY_STATS_FD:
4230                 return 1;
4231         default:
4232                 break;
4233         }
4234         return kvm_vm_ioctl_check_extension(kvm, arg);
4235 }
4236
4237 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4238 {
4239         int r;
4240
4241         if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4242                 return -EINVAL;
4243
4244         /* the size should be power of 2 */
4245         if (!size || (size & (size - 1)))
4246                 return -EINVAL;
4247
4248         /* Should be bigger to keep the reserved entries, or a page */
4249         if (size < kvm_dirty_ring_get_rsvd_entries() *
4250             sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
4251                 return -EINVAL;
4252
4253         if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4254             sizeof(struct kvm_dirty_gfn))
4255                 return -E2BIG;
4256
4257         /* We only allow it to set once */
4258         if (kvm->dirty_ring_size)
4259                 return -EINVAL;
4260
4261         mutex_lock(&kvm->lock);
4262
4263         if (kvm->created_vcpus) {
4264                 /* We don't allow to change this value after vcpu created */
4265                 r = -EINVAL;
4266         } else {
4267                 kvm->dirty_ring_size = size;
4268                 r = 0;
4269         }
4270
4271         mutex_unlock(&kvm->lock);
4272         return r;
4273 }
4274
4275 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4276 {
4277         int i;
4278         struct kvm_vcpu *vcpu;
4279         int cleared = 0;
4280
4281         if (!kvm->dirty_ring_size)
4282                 return -EINVAL;
4283
4284         mutex_lock(&kvm->slots_lock);
4285
4286         kvm_for_each_vcpu(i, vcpu, kvm)
4287                 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);
4288
4289         mutex_unlock(&kvm->slots_lock);
4290
4291         if (cleared)
4292                 kvm_flush_remote_tlbs(kvm);
4293
4294         return cleared;
4295 }
4296
4297 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4298                                                   struct kvm_enable_cap *cap)
4299 {
4300         return -EINVAL;
4301 }
4302
4303 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4304                                            struct kvm_enable_cap *cap)
4305 {
4306         switch (cap->cap) {
4307 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4308         case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4309                 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4310
4311                 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4312                         allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4313
4314                 if (cap->flags || (cap->args[0] & ~allowed_options))
4315                         return -EINVAL;
4316                 kvm->manual_dirty_log_protect = cap->args[0];
4317                 return 0;
4318         }
4319 #endif
4320         case KVM_CAP_HALT_POLL: {
4321                 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
4322                         return -EINVAL;
4323
4324                 kvm->max_halt_poll_ns = cap->args[0];
4325                 return 0;
4326         }
4327         case KVM_CAP_DIRTY_LOG_RING:
4328                 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
4329         default:
4330                 return kvm_vm_ioctl_enable_cap(kvm, cap);
4331         }
4332 }
4333
4334 static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
4335                               size_t size, loff_t *offset)
4336 {
4337         struct kvm *kvm = file->private_data;
4338
4339         return kvm_stats_read(kvm->stats_id, &kvm_vm_stats_header,
4340                                 &kvm_vm_stats_desc[0], &kvm->stat,
4341                                 sizeof(kvm->stat), user_buffer, size, offset);
4342 }
4343
4344 static const struct file_operations kvm_vm_stats_fops = {
4345         .read = kvm_vm_stats_read,
4346         .llseek = noop_llseek,
4347 };
4348
4349 static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4350 {
4351         int fd;
4352         struct file *file;
4353
4354         fd = get_unused_fd_flags(O_CLOEXEC);
4355         if (fd < 0)
4356                 return fd;
4357
4358         file = anon_inode_getfile("kvm-vm-stats",
4359                         &kvm_vm_stats_fops, kvm, O_RDONLY);
4360         if (IS_ERR(file)) {
4361                 put_unused_fd(fd);
4362                 return PTR_ERR(file);
4363         }
4364         file->f_mode |= FMODE_PREAD;
4365         fd_install(fd, file);
4366
4367         return fd;
4368 }
4369
4370 static long kvm_vm_ioctl(struct file *filp,
4371                            unsigned int ioctl, unsigned long arg)
4372 {
4373         struct kvm *kvm = filp->private_data;
4374         void __user *argp = (void __user *)arg;
4375         int r;
4376
4377         if (kvm->mm != current->mm || kvm->vm_bugged)
4378                 return -EIO;
4379         switch (ioctl) {
4380         case KVM_CREATE_VCPU:
4381                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
4382                 break;
4383         case KVM_ENABLE_CAP: {
4384                 struct kvm_enable_cap cap;
4385
4386                 r = -EFAULT;
4387                 if (copy_from_user(&cap, argp, sizeof(cap)))
4388                         goto out;
4389                 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
4390                 break;
4391         }
4392         case KVM_SET_USER_MEMORY_REGION: {
4393                 struct kvm_userspace_memory_region kvm_userspace_mem;
4394
4395                 r = -EFAULT;
4396                 if (copy_from_user(&kvm_userspace_mem, argp,
4397                                                 sizeof(kvm_userspace_mem)))
4398                         goto out;
4399
4400                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
4401                 break;
4402         }
4403         case KVM_GET_DIRTY_LOG: {
4404                 struct kvm_dirty_log log;
4405
4406                 r = -EFAULT;
4407                 if (copy_from_user(&log, argp, sizeof(log)))
4408                         goto out;
4409                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4410                 break;
4411         }
4412 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4413         case KVM_CLEAR_DIRTY_LOG: {
4414                 struct kvm_clear_dirty_log log;
4415
4416                 r = -EFAULT;
4417                 if (copy_from_user(&log, argp, sizeof(log)))
4418                         goto out;
4419                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4420                 break;
4421         }
4422 #endif
4423 #ifdef CONFIG_KVM_MMIO
4424         case KVM_REGISTER_COALESCED_MMIO: {
4425                 struct kvm_coalesced_mmio_zone zone;
4426
4427                 r = -EFAULT;
4428                 if (copy_from_user(&zone, argp, sizeof(zone)))
4429                         goto out;
4430                 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
4431                 break;
4432         }
4433         case KVM_UNREGISTER_COALESCED_MMIO: {
4434                 struct kvm_coalesced_mmio_zone zone;
4435
4436                 r = -EFAULT;
4437                 if (copy_from_user(&zone, argp, sizeof(zone)))
4438                         goto out;
4439                 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
4440                 break;
4441         }
4442 #endif
4443         case KVM_IRQFD: {
4444                 struct kvm_irqfd data;
4445
4446                 r = -EFAULT;
4447                 if (copy_from_user(&data, argp, sizeof(data)))
4448                         goto out;
4449                 r = kvm_irqfd(kvm, &data);
4450                 break;
4451         }
4452         case KVM_IOEVENTFD: {
4453                 struct kvm_ioeventfd data;
4454
4455                 r = -EFAULT;
4456                 if (copy_from_user(&data, argp, sizeof(data)))
4457                         goto out;
4458                 r = kvm_ioeventfd(kvm, &data);
4459                 break;
4460         }
4461 #ifdef CONFIG_HAVE_KVM_MSI
4462         case KVM_SIGNAL_MSI: {
4463                 struct kvm_msi msi;
4464
4465                 r = -EFAULT;
4466                 if (copy_from_user(&msi, argp, sizeof(msi)))
4467                         goto out;
4468                 r = kvm_send_userspace_msi(kvm, &msi);
4469                 break;
4470         }
4471 #endif
4472 #ifdef __KVM_HAVE_IRQ_LINE
4473         case KVM_IRQ_LINE_STATUS:
4474         case KVM_IRQ_LINE: {
4475                 struct kvm_irq_level irq_event;
4476
4477                 r = -EFAULT;
4478                 if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
4479                         goto out;
4480
4481                 r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
4482                                         ioctl == KVM_IRQ_LINE_STATUS);
4483                 if (r)
4484                         goto out;
4485
4486                 r = -EFAULT;
4487                 if (ioctl == KVM_IRQ_LINE_STATUS) {
4488                         if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
4489                                 goto out;
4490                 }
4491
4492                 r = 0;
4493                 break;
4494         }
4495 #endif
4496 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4497         case KVM_SET_GSI_ROUTING: {
4498                 struct kvm_irq_routing routing;
4499                 struct kvm_irq_routing __user *urouting;
4500                 struct kvm_irq_routing_entry *entries = NULL;
4501
4502                 r = -EFAULT;
4503                 if (copy_from_user(&routing, argp, sizeof(routing)))
4504                         goto out;
4505                 r = -EINVAL;
4506                 if (!kvm_arch_can_set_irq_routing(kvm))
4507                         goto out;
4508                 if (routing.nr > KVM_MAX_IRQ_ROUTES)
4509                         goto out;
4510                 if (routing.flags)
4511                         goto out;
4512                 if (routing.nr) {
4513                         urouting = argp;
4514                         entries = vmemdup_user(urouting->entries,
4515                                                array_size(sizeof(*entries),
4516                                                           routing.nr));
4517                         if (IS_ERR(entries)) {
4518                                 r = PTR_ERR(entries);
4519                                 goto out;
4520                         }
4521                 }
4522                 r = kvm_set_irq_routing(kvm, entries, routing.nr,
4523                                         routing.flags);
4524                 kvfree(entries);
4525                 break;
4526         }
4527 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4528         case KVM_CREATE_DEVICE: {
4529                 struct kvm_create_device cd;
4530
4531                 r = -EFAULT;
4532                 if (copy_from_user(&cd, argp, sizeof(cd)))
4533                         goto out;
4534
4535                 r = kvm_ioctl_create_device(kvm, &cd);
4536                 if (r)
4537                         goto out;
4538
4539                 r = -EFAULT;
4540                 if (copy_to_user(argp, &cd, sizeof(cd)))
4541                         goto out;
4542
4543                 r = 0;
4544                 break;
4545         }
4546         case KVM_CHECK_EXTENSION:
4547                 r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4548                 break;
4549         case KVM_RESET_DIRTY_RINGS:
4550                 r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4551                 break;
4552         case KVM_GET_STATS_FD:
4553                 r = kvm_vm_ioctl_get_stats_fd(kvm);
4554                 break;
4555         default:
4556                 r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4557         }
4558 out:
4559         return r;
4560 }
4561
4562 #ifdef CONFIG_KVM_COMPAT
4563 struct compat_kvm_dirty_log {
4564         __u32 slot;
4565         __u32 padding1;
4566         union {
4567                 compat_uptr_t dirty_bitmap; /* one bit per page */
4568                 __u64 padding2;
4569         };
4570 };
4571
4572 struct compat_kvm_clear_dirty_log {
4573         __u32 slot;
4574         __u32 num_pages;
4575         __u64 first_page;
4576         union {
4577                 compat_uptr_t dirty_bitmap; /* one bit per page */
4578                 __u64 padding2;
4579         };
4580 };
4581
4582 static long kvm_vm_compat_ioctl(struct file *filp,
4583                            unsigned int ioctl, unsigned long arg)
4584 {
4585         struct kvm *kvm = filp->private_data;
4586         int r;
4587
4588         if (kvm->mm != current->mm || kvm->vm_bugged)
4589                 return -EIO;
4590         switch (ioctl) {
4591 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4592         case KVM_CLEAR_DIRTY_LOG: {
4593                 struct compat_kvm_clear_dirty_log compat_log;
4594                 struct kvm_clear_dirty_log log;
4595
4596                 if (copy_from_user(&compat_log, (void __user *)arg,
4597                                    sizeof(compat_log)))
4598                         return -EFAULT;
4599                 log.slot         = compat_log.slot;
4600                 log.num_pages    = compat_log.num_pages;
4601                 log.first_page   = compat_log.first_page;
4602                 log.padding2     = compat_log.padding2;
4603                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4604
4605                 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
4606                 break;
4607         }
4608 #endif
4609         case KVM_GET_DIRTY_LOG: {
4610                 struct compat_kvm_dirty_log compat_log;
4611                 struct kvm_dirty_log log;
4612
4613                 if (copy_from_user(&compat_log, (void __user *)arg,
4614                                    sizeof(compat_log)))
4615                         return -EFAULT;
4616                 log.slot         = compat_log.slot;
4617                 log.padding1     = compat_log.padding1;
4618                 log.padding2     = compat_log.padding2;
4619                 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
4620
4621                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
4622                 break;
4623         }
4624         default:
4625                 r = kvm_vm_ioctl(filp, ioctl, arg);
4626         }
4627         return r;
4628 }
4629 #endif
4630
4631 static struct file_operations kvm_vm_fops = {
4632         .release        = kvm_vm_release,
4633         .unlocked_ioctl = kvm_vm_ioctl,
4634         .llseek         = noop_llseek,
4635         KVM_COMPAT(kvm_vm_compat_ioctl),
4636 };
4637
4638 bool file_is_kvm(struct file *file)
4639 {
4640         return file && file->f_op == &kvm_vm_fops;
4641 }
4642 EXPORT_SYMBOL_GPL(file_is_kvm);
4643
4644 static int kvm_dev_ioctl_create_vm(unsigned long type)
4645 {
4646         int r;
4647         struct kvm *kvm;
4648         struct file *file;
4649
4650         kvm = kvm_create_vm(type);
4651         if (IS_ERR(kvm))
4652                 return PTR_ERR(kvm);
4653 #ifdef CONFIG_KVM_MMIO
4654         r = kvm_coalesced_mmio_init(kvm);
4655         if (r < 0)
4656                 goto put_kvm;
4657 #endif
4658         r = get_unused_fd_flags(O_CLOEXEC);
4659         if (r < 0)
4660                 goto put_kvm;
4661
4662         snprintf(kvm->stats_id, sizeof(kvm->stats_id),
4663                         "kvm-%d", task_pid_nr(current));
4664
4665         file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
4666         if (IS_ERR(file)) {
4667                 put_unused_fd(r);
4668                 r = PTR_ERR(file);
4669                 goto put_kvm;
4670         }
4671
4672         /*
4673          * Don't call kvm_put_kvm anymore at this point; file->f_op is
4674          * already set, with ->release() being kvm_vm_release().  In error
4675          * cases it will be called by the final fput(file) and will take
4676          * care of doing kvm_put_kvm(kvm).
4677          */
4678         if (kvm_create_vm_debugfs(kvm, r) < 0) {
4679                 put_unused_fd(r);
4680                 fput(file);
4681                 return -ENOMEM;
4682         }
4683         kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
4684
4685         fd_install(r, file);
4686         return r;
4687
4688 put_kvm:
4689         kvm_put_kvm(kvm);
4690         return r;
4691 }
4692
4693 static long kvm_dev_ioctl(struct file *filp,
4694                           unsigned int ioctl, unsigned long arg)
4695 {
4696         long r = -EINVAL;
4697
4698         switch (ioctl) {
4699         case KVM_GET_API_VERSION:
4700                 if (arg)
4701                         goto out;
4702                 r = KVM_API_VERSION;
4703                 break;
4704         case KVM_CREATE_VM:
4705                 r = kvm_dev_ioctl_create_vm(arg);
4706                 break;
4707         case KVM_CHECK_EXTENSION:
4708                 r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
4709                 break;
4710         case KVM_GET_VCPU_MMAP_SIZE:
4711                 if (arg)
4712                         goto out;
4713                 r = PAGE_SIZE;     /* struct kvm_run */
4714 #ifdef CONFIG_X86
4715                 r += PAGE_SIZE;    /* pio data page */
4716 #endif
4717 #ifdef CONFIG_KVM_MMIO
4718                 r += PAGE_SIZE;    /* coalesced mmio ring page */
4719 #endif
4720                 break;
4721         case KVM_TRACE_ENABLE:
4722         case KVM_TRACE_PAUSE:
4723         case KVM_TRACE_DISABLE:
4724                 r = -EOPNOTSUPP;
4725                 break;
4726         default:
4727                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
4728         }
4729 out:
4730         return r;
4731 }
4732
4733 static struct file_operations kvm_chardev_ops = {
4734         .unlocked_ioctl = kvm_dev_ioctl,
4735         .llseek         = noop_llseek,
4736         KVM_COMPAT(kvm_dev_ioctl),
4737 };
4738
4739 static struct miscdevice kvm_dev = {
4740         KVM_MINOR,
4741         "kvm",
4742         &kvm_chardev_ops,
4743 };
4744
4745 static void hardware_enable_nolock(void *junk)
4746 {
4747         int cpu = raw_smp_processor_id();
4748         int r;
4749
4750         if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
4751                 return;
4752
4753         cpumask_set_cpu(cpu, cpus_hardware_enabled);
4754
4755         r = kvm_arch_hardware_enable();
4756
4757         if (r) {
4758                 cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4759                 atomic_inc(&hardware_enable_failed);
4760                 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu);
4761         }
4762 }
4763
4764 static int kvm_starting_cpu(unsigned int cpu)
4765 {
4766         raw_spin_lock(&kvm_count_lock);
4767         if (kvm_usage_count)
4768                 hardware_enable_nolock(NULL);
4769         raw_spin_unlock(&kvm_count_lock);
4770         return 0;
4771 }
4772
4773 static void hardware_disable_nolock(void *junk)
4774 {
4775         int cpu = raw_smp_processor_id();
4776
4777         if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
4778                 return;
4779         cpumask_clear_cpu(cpu, cpus_hardware_enabled);
4780         kvm_arch_hardware_disable();
4781 }
4782
4783 static int kvm_dying_cpu(unsigned int cpu)
4784 {
4785         raw_spin_lock(&kvm_count_lock);
4786         if (kvm_usage_count)
4787                 hardware_disable_nolock(NULL);
4788         raw_spin_unlock(&kvm_count_lock);
4789         return 0;
4790 }
4791
4792 static void hardware_disable_all_nolock(void)
4793 {
4794         BUG_ON(!kvm_usage_count);
4795
4796         kvm_usage_count--;
4797         if (!kvm_usage_count)
4798                 on_each_cpu(hardware_disable_nolock, NULL, 1);
4799 }
4800
4801 static void hardware_disable_all(void)
4802 {
4803         raw_spin_lock(&kvm_count_lock);
4804         hardware_disable_all_nolock();
4805         raw_spin_unlock(&kvm_count_lock);
4806 }
4807
4808 static int hardware_enable_all(void)
4809 {
4810         int r = 0;
4811
4812         raw_spin_lock(&kvm_count_lock);
4813
4814         kvm_usage_count++;
4815         if (kvm_usage_count == 1) {
4816                 atomic_set(&hardware_enable_failed, 0);
4817                 on_each_cpu(hardware_enable_nolock, NULL, 1);
4818
4819                 if (atomic_read(&hardware_enable_failed)) {
4820                         hardware_disable_all_nolock();
4821                         r = -EBUSY;
4822                 }
4823         }
4824
4825         raw_spin_unlock(&kvm_count_lock);
4826
4827         return r;
4828 }
4829
4830 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
4831                       void *v)
4832 {
4833         /*
4834          * Some (well, at least mine) BIOSes hang on reboot if
4835          * in vmx root mode.
4836          *
4837          * And Intel TXT required VMX off for all cpu when system shutdown.
4838          */
4839         pr_info("kvm: exiting hardware virtualization\n");
4840         kvm_rebooting = true;
4841         on_each_cpu(hardware_disable_nolock, NULL, 1);
4842         return NOTIFY_OK;
4843 }
4844
4845 static struct notifier_block kvm_reboot_notifier = {
4846         .notifier_call = kvm_reboot,
4847         .priority = 0,
4848 };
4849
4850 static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
4851 {
4852         int i;
4853
4854         for (i = 0; i < bus->dev_count; i++) {
4855                 struct kvm_io_device *pos = bus->range[i].dev;
4856
4857                 kvm_iodevice_destructor(pos);
4858         }
4859         kfree(bus);
4860 }
4861
4862 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
4863                                  const struct kvm_io_range *r2)
4864 {
4865         gpa_t addr1 = r1->addr;
4866         gpa_t addr2 = r2->addr;
4867
4868         if (addr1 < addr2)
4869                 return -1;
4870
4871         /* If r2->len == 0, match the exact address.  If r2->len != 0,
4872          * accept any overlapping write.  Any order is acceptable for
4873          * overlapping ranges, because kvm_io_bus_get_first_dev ensures
4874          * we process all of them.
4875          */
4876         if (r2->len) {
4877                 addr1 += r1->len;
4878                 addr2 += r2->len;
4879         }
4880
4881         if (addr1 > addr2)
4882                 return 1;
4883
4884         return 0;
4885 }
4886
/* void-pointer adapter around kvm_io_bus_cmp(), as needed by bsearch(). */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	const struct kvm_io_range *lhs = p1;
	const struct kvm_io_range *rhs = p2;

	return kvm_io_bus_cmp(lhs, rhs);
}
4891
4892 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
4893                              gpa_t addr, int len)
4894 {
4895         struct kvm_io_range *range, key;
4896         int off;
4897
4898         key = (struct kvm_io_range) {
4899                 .addr = addr,
4900                 .len = len,
4901         };
4902
4903         range = bsearch(&key, bus->range, bus->dev_count,
4904                         sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
4905         if (range == NULL)
4906                 return -ENOENT;
4907
4908         off = range - bus->range;
4909
4910         while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0)
4911                 off--;
4912
4913         return off;
4914 }
4915
4916 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4917                               struct kvm_io_range *range, const void *val)
4918 {
4919         int idx;
4920
4921         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4922         if (idx < 0)
4923                 return -EOPNOTSUPP;
4924
4925         while (idx < bus->dev_count &&
4926                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4927                 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
4928                                         range->len, val))
4929                         return idx;
4930                 idx++;
4931         }
4932
4933         return -EOPNOTSUPP;
4934 }
4935
4936 /* kvm_io_bus_write - called under kvm->slots_lock */
4937 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
4938                      int len, const void *val)
4939 {
4940         struct kvm_io_bus *bus;
4941         struct kvm_io_range range;
4942         int r;
4943
4944         range = (struct kvm_io_range) {
4945                 .addr = addr,
4946                 .len = len,
4947         };
4948
4949         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4950         if (!bus)
4951                 return -ENOMEM;
4952         r = __kvm_io_bus_write(vcpu, bus, &range, val);
4953         return r < 0 ? r : 0;
4954 }
4955 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
4956
4957 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
4958 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
4959                             gpa_t addr, int len, const void *val, long cookie)
4960 {
4961         struct kvm_io_bus *bus;
4962         struct kvm_io_range range;
4963
4964         range = (struct kvm_io_range) {
4965                 .addr = addr,
4966                 .len = len,
4967         };
4968
4969         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
4970         if (!bus)
4971                 return -ENOMEM;
4972
4973         /* First try the device referenced by cookie. */
4974         if ((cookie >= 0) && (cookie < bus->dev_count) &&
4975             (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
4976                 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
4977                                         val))
4978                         return cookie;
4979
4980         /*
4981          * cookie contained garbage; fall back to search and return the
4982          * correct cookie value.
4983          */
4984         return __kvm_io_bus_write(vcpu, bus, &range, val);
4985 }
4986
4987 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
4988                              struct kvm_io_range *range, void *val)
4989 {
4990         int idx;
4991
4992         idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
4993         if (idx < 0)
4994                 return -EOPNOTSUPP;
4995
4996         while (idx < bus->dev_count &&
4997                 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
4998                 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
4999                                        range->len, val))
5000                         return idx;
5001                 idx++;
5002         }
5003
5004         return -EOPNOTSUPP;
5005 }
5006
5007 /* kvm_io_bus_read - called under kvm->slots_lock */
5008 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
5009                     int len, void *val)
5010 {
5011         struct kvm_io_bus *bus;
5012         struct kvm_io_range range;
5013         int r;
5014
5015         range = (struct kvm_io_range) {
5016                 .addr = addr,
5017                 .len = len,
5018         };
5019
5020         bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5021         if (!bus)
5022                 return -ENOMEM;
5023         r = __kvm_io_bus_read(vcpu, bus, &range, val);
5024         return r < 0 ? r : 0;
5025 }
5026
5027 /* Caller must hold slots_lock. */
5028 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
5029                             int len, struct kvm_io_device *dev)
5030 {
5031         int i;
5032         struct kvm_io_bus *new_bus, *bus;
5033         struct kvm_io_range range;
5034
5035         bus = kvm_get_bus(kvm, bus_idx);
5036         if (!bus)
5037                 return -ENOMEM;
5038
5039         /* exclude ioeventfd which is limited by maximum fd */
5040         if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
5041                 return -ENOSPC;
5042
5043         new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
5044                           GFP_KERNEL_ACCOUNT);
5045         if (!new_bus)
5046                 return -ENOMEM;
5047
5048         range = (struct kvm_io_range) {
5049                 .addr = addr,
5050                 .len = len,
5051                 .dev = dev,
5052         };
5053
5054         for (i = 0; i < bus->dev_count; i++)
5055                 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
5056                         break;
5057
5058         memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
5059         new_bus->dev_count++;
5060         new_bus->range[i] = range;
5061         memcpy(new_bus->range + i + 1, bus->range + i,
5062                 (bus->dev_count - i) * sizeof(struct kvm_io_range));
5063         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5064         synchronize_srcu_expedited(&kvm->srcu);
5065         kfree(bus);
5066
5067         return 0;
5068 }
5069
5070 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5071                               struct kvm_io_device *dev)
5072 {
5073         int i, j;
5074         struct kvm_io_bus *new_bus, *bus;
5075
5076         lockdep_assert_held(&kvm->slots_lock);
5077
5078         bus = kvm_get_bus(kvm, bus_idx);
5079         if (!bus)
5080                 return 0;
5081
5082         for (i = 0; i < bus->dev_count; i++) {
5083                 if (bus->range[i].dev == dev) {
5084                         break;
5085                 }
5086         }
5087
5088         if (i == bus->dev_count)
5089                 return 0;
5090
5091         new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
5092                           GFP_KERNEL_ACCOUNT);
5093         if (new_bus) {
5094                 memcpy(new_bus, bus, struct_size(bus, range, i));
5095                 new_bus->dev_count--;
5096                 memcpy(new_bus->range + i, bus->range + i + 1,
5097                                 flex_array_size(new_bus, range, new_bus->dev_count - i));
5098         }
5099
5100         rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5101         synchronize_srcu_expedited(&kvm->srcu);
5102
5103         /* Destroy the old bus _after_ installing the (null) bus. */
5104         if (!new_bus) {
5105                 pr_err("kvm: failed to shrink bus, removing it completely\n");
5106                 for (j = 0; j < bus->dev_count; j++) {
5107                         if (j == i)
5108                                 continue;
5109                         kvm_iodevice_destructor(bus->range[j].dev);
5110                 }
5111         }
5112
5113         kfree(bus);
5114         return new_bus ? 0 : -ENOMEM;
5115 }
5116
5117 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
5118                                          gpa_t addr)
5119 {
5120         struct kvm_io_bus *bus;
5121         int dev_idx, srcu_idx;
5122         struct kvm_io_device *iodev = NULL;
5123
5124         srcu_idx = srcu_read_lock(&kvm->srcu);
5125
5126         bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5127         if (!bus)
5128                 goto out_unlock;
5129
5130         dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
5131         if (dev_idx < 0)
5132                 goto out_unlock;
5133
5134         iodev = bus->range[dev_idx].dev;
5135
5136 out_unlock:
5137         srcu_read_unlock(&kvm->srcu, srcu_idx);
5138
5139         return iodev;
5140 }
5141 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5142
5143 static int kvm_debugfs_open(struct inode *inode, struct file *file,
5144                            int (*get)(void *, u64 *), int (*set)(void *, u64),
5145                            const char *fmt)
5146 {
5147         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5148                                           inode->i_private;
5149
5150         /*
5151          * The debugfs files are a reference to the kvm struct which
5152         * is still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
5153         * avoids the race between open and the removal of the debugfs directory.
5154          */
5155         if (!kvm_get_kvm_safe(stat_data->kvm))
5156                 return -ENOENT;
5157
5158         if (simple_attr_open(inode, file, get,
5159                     kvm_stats_debugfs_mode(stat_data->desc) & 0222
5160                     ? set : NULL,
5161                     fmt)) {
5162                 kvm_put_kvm(stat_data->kvm);
5163                 return -ENOMEM;
5164         }
5165
5166         return 0;
5167 }
5168
5169 static int kvm_debugfs_release(struct inode *inode, struct file *file)
5170 {
5171         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
5172                                           inode->i_private;
5173
5174         simple_attr_release(inode, file);
5175         kvm_put_kvm(stat_data->kvm);
5176
5177         return 0;
5178 }
5179
5180 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
5181 {
5182         *val = *(u64 *)((void *)(&kvm->stat) + offset);
5183
5184         return 0;
5185 }
5186
5187 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5188 {
5189         *(u64 *)((void *)(&kvm->stat) + offset) = 0;
5190
5191         return 0;
5192 }
5193
5194 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
5195 {
5196         int i;
5197         struct kvm_vcpu *vcpu;
5198
5199         *val = 0;
5200
5201         kvm_for_each_vcpu(i, vcpu, kvm)
5202                 *val += *(u64 *)((void *)(&vcpu->stat) + offset);
5203
5204         return 0;
5205 }
5206
5207 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5208 {
5209         int i;
5210         struct kvm_vcpu *vcpu;
5211
5212         kvm_for_each_vcpu(i, vcpu, kvm)
5213                 *(u64 *)((void *)(&vcpu->stat) + offset) = 0;
5214
5215         return 0;
5216 }
5217
5218 static int kvm_stat_data_get(void *data, u64 *val)
5219 {
5220         int r = -EFAULT;
5221         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5222
5223         switch (stat_data->kind) {
5224         case KVM_STAT_VM:
5225                 r = kvm_get_stat_per_vm(stat_data->kvm,
5226                                         stat_data->desc->desc.offset, val);
5227                 break;
5228         case KVM_STAT_VCPU:
5229                 r = kvm_get_stat_per_vcpu(stat_data->kvm,
5230                                           stat_data->desc->desc.offset, val);
5231                 break;
5232         }
5233
5234         return r;
5235 }
5236
5237 static int kvm_stat_data_clear(void *data, u64 val)
5238 {
5239         int r = -EFAULT;
5240         struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
5241
5242         if (val)
5243                 return -EINVAL;
5244
5245         switch (stat_data->kind) {
5246         case KVM_STAT_VM:
5247                 r = kvm_clear_stat_per_vm(stat_data->kvm,
5248                                           stat_data->desc->desc.offset);
5249                 break;
5250         case KVM_STAT_VCPU:
5251                 r = kvm_clear_stat_per_vcpu(stat_data->kvm,
5252                                             stat_data->desc->desc.offset);
5253                 break;
5254         }
5255
5256         return r;
5257 }
5258
5259 static int kvm_stat_data_open(struct inode *inode, struct file *file)
5260 {
5261         __simple_attr_check_format("%llu\n", 0ull);
5262         return kvm_debugfs_open(inode, file, kvm_stat_data_get,
5263                                 kvm_stat_data_clear, "%llu\n");
5264 }
5265
5266 static const struct file_operations stat_fops_per_vm = {
5267         .owner = THIS_MODULE,
5268         .open = kvm_stat_data_open,
5269         .release = kvm_debugfs_release,
5270         .read = simple_attr_read,
5271         .write = simple_attr_write,
5272         .llseek = no_llseek,
5273 };
5274
5275 static int vm_stat_get(void *_offset, u64 *val)
5276 {
5277         unsigned offset = (long)_offset;
5278         struct kvm *kvm;
5279         u64 tmp_val;
5280
5281         *val = 0;
5282         mutex_lock(&kvm_lock);
5283         list_for_each_entry(kvm, &vm_list, vm_list) {
5284                 kvm_get_stat_per_vm(kvm, offset, &tmp_val);
5285                 *val += tmp_val;
5286         }
5287         mutex_unlock(&kvm_lock);
5288         return 0;
5289 }
5290
5291 static int vm_stat_clear(void *_offset, u64 val)
5292 {
5293         unsigned offset = (long)_offset;
5294         struct kvm *kvm;
5295
5296         if (val)
5297                 return -EINVAL;
5298
5299         mutex_lock(&kvm_lock);
5300         list_for_each_entry(kvm, &vm_list, vm_list) {
5301                 kvm_clear_stat_per_vm(kvm, offset);
5302         }
5303         mutex_unlock(&kvm_lock);
5304
5305         return 0;
5306 }
5307
5308 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5309 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5310
5311 static int vcpu_stat_get(void *_offset, u64 *val)
5312 {
5313         unsigned offset = (long)_offset;
5314         struct kvm *kvm;
5315         u64 tmp_val;
5316
5317         *val = 0;
5318         mutex_lock(&kvm_lock);
5319         list_for_each_entry(kvm, &vm_list, vm_list) {
5320                 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
5321                 *val += tmp_val;
5322         }
5323         mutex_unlock(&kvm_lock);
5324         return 0;
5325 }
5326
5327 static int vcpu_stat_clear(void *_offset, u64 val)
5328 {
5329         unsigned offset = (long)_offset;
5330         struct kvm *kvm;
5331
5332         if (val)
5333                 return -EINVAL;
5334
5335         mutex_lock(&kvm_lock);
5336         list_for_each_entry(kvm, &vm_list, vm_list) {
5337                 kvm_clear_stat_per_vcpu(kvm, offset);
5338         }
5339         mutex_unlock(&kvm_lock);
5340
5341         return 0;
5342 }
5343
5344 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5345                         "%llu\n");
5346 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5347
5348 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5349 {
5350         struct kobj_uevent_env *env;
5351         unsigned long long created, active;
5352
5353         if (!kvm_dev.this_device || !kvm)
5354                 return;
5355
5356         mutex_lock(&kvm_lock);
5357         if (type == KVM_EVENT_CREATE_VM) {
5358                 kvm_createvm_count++;
5359                 kvm_active_vms++;
5360         } else if (type == KVM_EVENT_DESTROY_VM) {
5361                 kvm_active_vms--;
5362         }
5363         created = kvm_createvm_count;
5364         active = kvm_active_vms;
5365         mutex_unlock(&kvm_lock);
5366
5367         env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
5368         if (!env)
5369                 return;
5370
5371         add_uevent_var(env, "CREATED=%llu", created);
5372         add_uevent_var(env, "COUNT=%llu", active);
5373
5374         if (type == KVM_EVENT_CREATE_VM) {
5375                 add_uevent_var(env, "EVENT=create");
5376                 kvm->userspace_pid = task_pid_nr(current);
5377         } else if (type == KVM_EVENT_DESTROY_VM) {
5378                 add_uevent_var(env, "EVENT=destroy");
5379         }
5380         add_uevent_var(env, "PID=%d", kvm->userspace_pid);
5381
5382         if (!IS_ERR(kvm->debugfs_dentry)) {
5383                 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5384
5385                 if (p) {
5386                         tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5387                         if (!IS_ERR(tmp))
5388                                 add_uevent_var(env, "STATS_PATH=%s", tmp);
5389                         kfree(p);
5390                 }
5391         }
5392         /* no need for checks, since we are adding at most only 5 keys */
5393         env->envp[env->envp_idx++] = NULL;
5394         kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
5395         kfree(env);
5396 }
5397
5398 static void kvm_init_debug(void)
5399 {
5400         const struct file_operations *fops;
5401         const struct _kvm_stats_desc *pdesc;
5402         int i;
5403
5404         kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
5405
5406         for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
5407                 pdesc = &kvm_vm_stats_desc[i];
5408                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5409                         fops = &vm_stat_fops;
5410                 else
5411                         fops = &vm_stat_readonly_fops;
5412                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5413                                 kvm_debugfs_dir,
5414                                 (void *)(long)pdesc->desc.offset, fops);
5415         }
5416
5417         for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
5418                 pdesc = &kvm_vcpu_stats_desc[i];
5419                 if (kvm_stats_debugfs_mode(pdesc) & 0222)
5420                         fops = &vcpu_stat_fops;
5421                 else
5422                         fops = &vcpu_stat_readonly_fops;
5423                 debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
5424                                 kvm_debugfs_dir,
5425                                 (void *)(long)pdesc->desc.offset, fops);
5426         }
5427 }
5428
5429 static int kvm_suspend(void)
5430 {
5431         if (kvm_usage_count)
5432                 hardware_disable_nolock(NULL);
5433         return 0;
5434 }
5435
5436 static void kvm_resume(void)
5437 {
5438         if (kvm_usage_count) {
5439                 lockdep_assert_not_held(&kvm_count_lock);
5440                 hardware_enable_nolock(NULL);
5441         }
5442 }
5443
5444 static struct syscore_ops kvm_syscore_ops = {
5445         .suspend = kvm_suspend,
5446         .resume = kvm_resume,
5447 };
5448
5449 static inline
5450 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
5451 {
5452         return container_of(pn, struct kvm_vcpu, preempt_notifier);
5453 }
5454
5455 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
5456 {
5457         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5458
5459         WRITE_ONCE(vcpu->preempted, false);
5460         WRITE_ONCE(vcpu->ready, false);
5461
5462         __this_cpu_write(kvm_running_vcpu, vcpu);
5463         kvm_arch_sched_in(vcpu, cpu);
5464         kvm_arch_vcpu_load(vcpu, cpu);
5465 }
5466
5467 static void kvm_sched_out(struct preempt_notifier *pn,
5468                           struct task_struct *next)
5469 {
5470         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5471
5472         if (current->on_rq) {
5473                 WRITE_ONCE(vcpu->preempted, true);
5474                 WRITE_ONCE(vcpu->ready, true);
5475         }
5476         kvm_arch_vcpu_put(vcpu);
5477         __this_cpu_write(kvm_running_vcpu, NULL);
5478 }
5479
5480 /**
5481  * kvm_get_running_vcpu - get the vcpu running on the current CPU.
5482  *
5483  * We can disable preemption locally around accessing the per-CPU variable,
5484  * and use the resolved vcpu pointer after enabling preemption again,
5485  * because even if the current thread is migrated to another CPU, reading
5486  * the per-CPU value later will give us the same value as we update the
5487  * per-CPU variable in the preempt notifier handlers.
5488  */
5489 struct kvm_vcpu *kvm_get_running_vcpu(void)
5490 {
5491         struct kvm_vcpu *vcpu;
5492
5493         preempt_disable();
5494         vcpu = __this_cpu_read(kvm_running_vcpu);
5495         preempt_enable();
5496
5497         return vcpu;
5498 }
5499 EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5500
5501 /**
5502  * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
5503  */
5504 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
5505 {
5506         return &kvm_running_vcpu;
5507 }
5508
/* Argument bundle handed to check_processor_compat() on each CPU. */
struct kvm_cpu_compat_check {
	void *opaque;	/* passed through to kvm_arch_check_processor_compat() */
	int *ret;	/* where the check's result is stored */
};

/*
 * Cross-call callback: run the arch compatibility check and store its
 * result through the caller-supplied pointer.
 */
static void check_processor_compat(void *data)
{
	struct kvm_cpu_compat_check *check = data;
	int result = kvm_arch_check_processor_compat(check->opaque);

	*check->ret = result;
}
5520
5521 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
5522                   struct module *module)
5523 {
5524         struct kvm_cpu_compat_check c;
5525         int r;
5526         int cpu;
5527
5528         r = kvm_arch_init(opaque);
5529         if (r)
5530                 goto out_fail;
5531
5532         /*
5533          * kvm_arch_init makes sure there's at most one caller
5534          * for architectures that support multiple implementations,
5535          * like intel and amd on x86.
5536          * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
5537          * conflicts in case kvm is already setup for another implementation.
5538          */
5539         r = kvm_irqfd_init();
5540         if (r)
5541                 goto out_irqfd;
5542
5543         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
5544                 r = -ENOMEM;
5545                 goto out_free_0;
5546         }
5547
5548         r = kvm_arch_hardware_setup(opaque);
5549         if (r < 0)
5550                 goto out_free_1;
5551
5552         c.ret = &r;
5553         c.opaque = opaque;
5554         for_each_online_cpu(cpu) {
5555                 smp_call_function_single(cpu, check_processor_compat, &c, 1);
5556                 if (r < 0)
5557                         goto out_free_2;
5558         }
5559
5560         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
5561                                       kvm_starting_cpu, kvm_dying_cpu);
5562         if (r)
5563                 goto out_free_2;
5564         register_reboot_notifier(&kvm_reboot_notifier);
5565
5566         /* A kmem cache lets us meet the alignment requirements of fx_save. */
5567         if (!vcpu_align)
5568                 vcpu_align = __alignof__(struct kvm_vcpu);
5569         kvm_vcpu_cache =
5570                 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
5571                                            SLAB_ACCOUNT,
5572                                            offsetof(struct kvm_vcpu, arch),
5573                                            offsetofend(struct kvm_vcpu, stats_id)
5574                                            - offsetof(struct kvm_vcpu, arch),
5575                                            NULL);
5576         if (!kvm_vcpu_cache) {
5577                 r = -ENOMEM;
5578                 goto out_free_3;
5579         }
5580
5581         r = kvm_async_pf_init();
5582         if (r)
5583                 goto out_free;
5584
5585         kvm_chardev_ops.owner = module;
5586         kvm_vm_fops.owner = module;
5587         kvm_vcpu_fops.owner = module;
5588
5589         r = misc_register(&kvm_dev);
5590         if (r) {
5591                 pr_err("kvm: misc device register failed\n");
5592                 goto out_unreg;
5593         }
5594
5595         register_syscore_ops(&kvm_syscore_ops);
5596
5597         kvm_preempt_ops.sched_in = kvm_sched_in;
5598         kvm_preempt_ops.sched_out = kvm_sched_out;
5599
5600         kvm_init_debug();
5601
5602         r = kvm_vfio_ops_init();
5603         WARN_ON(r);
5604
5605         return 0;
5606
5607 out_unreg:
5608         kvm_async_pf_deinit();
5609 out_free:
5610         kmem_cache_destroy(kvm_vcpu_cache);
5611 out_free_3:
5612         unregister_reboot_notifier(&kvm_reboot_notifier);
5613         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
5614 out_free_2:
5615         kvm_arch_hardware_unsetup();
5616 out_free_1:
5617         free_cpumask_var(cpus_hardware_enabled);
5618 out_free_0:
5619         kvm_irqfd_exit();
5620 out_irqfd:
5621         kvm_arch_exit();
5622 out_fail:
5623         return r;
5624 }
5625 EXPORT_SYMBOL_GPL(kvm_init);
5626
/*
 * Tear down everything kvm_init() set up: the debugfs directory, the KVM
 * misc device, the vCPU slab cache, async page fault support, the syscore,
 * reboot and CPU hotplug hooks, the arch hardware setup, the irqfd
 * infrastructure, the hardware-enabled cpumask and the VFIO device ops.
 */
void kvm_exit(void)
{
	/* Remove the userspace-visible entry points first. */
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	/* Run hardware_disable_nolock() on every CPU and wait for completion. */
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
5644
5645 struct kvm_vm_worker_thread_context {
5646         struct kvm *kvm;
5647         struct task_struct *parent;
5648         struct completion init_done;
5649         kvm_vm_thread_fn_t thread_fn;
5650         uintptr_t data;
5651         int err;
5652 };
5653
5654 static int kvm_vm_worker_thread(void *context)
5655 {
5656         /*
5657          * The init_context is allocated on the stack of the parent thread, so
5658          * we have to locally copy anything that is needed beyond initialization
5659          */
5660         struct kvm_vm_worker_thread_context *init_context = context;
5661         struct kvm *kvm = init_context->kvm;
5662         kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
5663         uintptr_t data = init_context->data;
5664         int err;
5665
5666         err = kthread_park(current);
5667         /* kthread_park(current) is never supposed to return an error */
5668         WARN_ON(err != 0);
5669         if (err)
5670                 goto init_complete;
5671
5672         err = cgroup_attach_task_all(init_context->parent, current);
5673         if (err) {
5674                 kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
5675                         __func__, err);
5676                 goto init_complete;
5677         }
5678
5679         set_user_nice(current, task_nice(init_context->parent));
5680
5681 init_complete:
5682         init_context->err = err;
5683         complete(&init_context->init_done);
5684         init_context = NULL;
5685
5686         if (err)
5687                 return err;
5688
5689         /* Wait to be woken up by the spawner before proceeding. */
5690         kthread_parkme();
5691
5692         if (!kthread_should_stop())
5693                 err = thread_fn(kvm, data);
5694
5695         return err;
5696 }
5697
5698 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
5699                                 uintptr_t data, const char *name,
5700                                 struct task_struct **thread_ptr)
5701 {
5702         struct kvm_vm_worker_thread_context init_context = {};
5703         struct task_struct *thread;
5704
5705         *thread_ptr = NULL;
5706         init_context.kvm = kvm;
5707         init_context.parent = current;
5708         init_context.thread_fn = thread_fn;
5709         init_context.data = data;
5710         init_completion(&init_context.init_done);
5711
5712         thread = kthread_run(kvm_vm_worker_thread, &init_context,
5713                              "%s-%d", name, task_pid_nr(current));
5714         if (IS_ERR(thread))
5715                 return PTR_ERR(thread);
5716
5717         /* kthread_run is never supposed to return NULL */
5718         WARN_ON(thread == NULL);
5719
5720         wait_for_completion(&init_context.init_done);
5721
5722         if (!init_context.err)
5723                 *thread_ptr = thread;
5724
5725         return init_context.err;
5726 }