GNU Linux-libre 5.4.274-gnu1
[releases.git] / arch / x86 / kvm / mmu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * This module enables machines with Intel VT-x extensions to run virtual
6  * machines without emulation or binary translation.
7  *
8  * MMU support
9  *
10  * Copyright (C) 2006 Qumranet, Inc.
11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
12  *
13  * Authors:
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Avi Kivity   <avi@qumranet.com>
16  */
17
18 #include "irq.h"
19 #include "mmu.h"
20 #include "x86.h"
21 #include "kvm_cache_regs.h"
22 #include "cpuid.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/types.h>
26 #include <linux/string.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/moduleparam.h>
30 #include <linux/export.h>
31 #include <linux/swap.h>
32 #include <linux/hugetlb.h>
33 #include <linux/compiler.h>
34 #include <linux/srcu.h>
35 #include <linux/slab.h>
36 #include <linux/sched/signal.h>
37 #include <linux/uaccess.h>
38 #include <linux/hash.h>
39 #include <linux/kern_levels.h>
40 #include <linux/kthread.h>
41
42 #include <asm/page.h>
43 #include <asm/pat.h>
44 #include <asm/cmpxchg.h>
45 #include <asm/e820/api.h>
46 #include <asm/io.h>
47 #include <asm/vmx.h>
48 #include <asm/kvm_page_track.h>
49 #include "trace.h"
50
51 extern bool itlb_multihit_kvm_mitigation;
52
53 static int __read_mostly nx_huge_pages = -1;
54 #ifdef CONFIG_PREEMPT_RT
55 /* Recovery can cause latency spikes, disable it for PREEMPT_RT.  */
56 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
57 #else
58 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
59 #endif
60
61 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
62 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
63
64 static struct kernel_param_ops nx_huge_pages_ops = {
65         .set = set_nx_huge_pages,
66         .get = param_get_bool,
67 };
68
69 static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
70         .set = set_nx_huge_pages_recovery_ratio,
71         .get = param_get_uint,
72 };
73
74 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
75 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
76 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
77                 &nx_huge_pages_recovery_ratio, 0644);
78 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
79
80 /*
81  * When setting this variable to true it enables Two-Dimensional-Paging
82  * where the hardware walks 2 page tables:
83  * 1. the guest-virtual to guest-physical
84  * 2. while doing 1. it walks guest-physical to host-physical
85  * If the hardware supports that we don't need to do shadow paging.
86  */
87 bool tdp_enabled = false;
88
89 enum {
90         AUDIT_PRE_PAGE_FAULT,
91         AUDIT_POST_PAGE_FAULT,
92         AUDIT_PRE_PTE_WRITE,
93         AUDIT_POST_PTE_WRITE,
94         AUDIT_PRE_SYNC,
95         AUDIT_POST_SYNC
96 };
97
98 #undef MMU_DEBUG
99
100 #ifdef MMU_DEBUG
101 static bool dbg = 0;
102 module_param(dbg, bool, 0644);
103
104 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
105 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
106 #define MMU_WARN_ON(x) WARN_ON(x)
107 #else
108 #define pgprintk(x...) do { } while (0)
109 #define rmap_printk(x...) do { } while (0)
110 #define MMU_WARN_ON(x) do { } while (0)
111 #endif
112
113 #define PTE_PREFETCH_NUM                8
114
115 #define PT_FIRST_AVAIL_BITS_SHIFT 10
116 #define PT64_SECOND_AVAIL_BITS_SHIFT 54
117
118 /*
119  * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
120  * Access Tracking SPTEs.
121  */
122 #define SPTE_SPECIAL_MASK (3ULL << 52)
123 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
124 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
125 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
126 #define SPTE_MMIO_MASK (3ULL << 52)
127
128 #define PT64_LEVEL_BITS 9
129
130 #define PT64_LEVEL_SHIFT(level) \
131                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
132
133 #define PT64_INDEX(address, level)\
134         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
135
136
137 #define PT32_LEVEL_BITS 10
138
139 #define PT32_LEVEL_SHIFT(level) \
140                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
141
142 #define PT32_LVL_OFFSET_MASK(level) \
143         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
144                                                 * PT32_LEVEL_BITS))) - 1))
145
146 #define PT32_INDEX(address, level)\
147         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
148
149
150 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
151 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
152 #else
153 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
154 #endif
155 #define PT64_LVL_ADDR_MASK(level) \
156         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
157                                                 * PT64_LEVEL_BITS))) - 1))
158 #define PT64_LVL_OFFSET_MASK(level) \
159         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
160                                                 * PT64_LEVEL_BITS))) - 1))
161
162 #define PT32_BASE_ADDR_MASK PAGE_MASK
163 #define PT32_DIR_BASE_ADDR_MASK \
164         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
165 #define PT32_LVL_ADDR_MASK(level) \
166         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
167                                             * PT32_LEVEL_BITS))) - 1))
168
169 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
170                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
171
172 #define ACC_EXEC_MASK    1
173 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
174 #define ACC_USER_MASK    PT_USER_MASK
175 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
176
177 /* The mask for the R/X bits in EPT PTEs */
178 #define PT64_EPT_READABLE_MASK                  0x1ull
179 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
180
181 #include <trace/events/kvm.h>
182
183 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
184 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
185
186 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
187
188 /* make pte_list_desc fit well in cache line */
189 #define PTE_LIST_EXT 3
190
191 /*
192  * Return values of handle_mmio_page_fault and mmu.page_fault:
193  * RET_PF_RETRY: let CPU fault again on the address.
194  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
195  *
196  * For handle_mmio_page_fault only:
197  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
198  */
199 enum {
200         RET_PF_RETRY = 0,
201         RET_PF_EMULATE = 1,
202         RET_PF_INVALID = 2,
203 };
204
205 struct pte_list_desc {
206         u64 *sptes[PTE_LIST_EXT];
207         struct pte_list_desc *more;
208 };
209
210 struct kvm_shadow_walk_iterator {
211         u64 addr;
212         hpa_t shadow_addr;
213         u64 *sptep;
214         int level;
215         unsigned index;
216 };
217
218 static const union kvm_mmu_page_role mmu_base_role_mask = {
219         .cr0_wp = 1,
220         .gpte_is_8_bytes = 1,
221         .nxe = 1,
222         .smep_andnot_wp = 1,
223         .smap_andnot_wp = 1,
224         .smm = 1,
225         .guest_mode = 1,
226         .ad_disabled = 1,
227 };
228
229 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
230         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
231                                          (_root), (_addr));                \
232              shadow_walk_okay(&(_walker));                                 \
233              shadow_walk_next(&(_walker)))
234
235 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
236         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
237              shadow_walk_okay(&(_walker));                      \
238              shadow_walk_next(&(_walker)))
239
240 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
241         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
242              shadow_walk_okay(&(_walker)) &&                            \
243                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
244              __shadow_walk_next(&(_walker), spte))
245
246 static struct kmem_cache *pte_list_desc_cache;
247 static struct kmem_cache *mmu_page_header_cache;
248 static struct percpu_counter kvm_total_used_mmu_pages;
249
250 static u64 __read_mostly shadow_nx_mask;
251 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
252 static u64 __read_mostly shadow_user_mask;
253 static u64 __read_mostly shadow_accessed_mask;
254 static u64 __read_mostly shadow_dirty_mask;
255 static u64 __read_mostly shadow_mmio_mask;
256 static u64 __read_mostly shadow_mmio_value;
257 static u64 __read_mostly shadow_mmio_access_mask;
258 static u64 __read_mostly shadow_present_mask;
259 static u64 __read_mostly shadow_me_mask;
260
261 /*
262  * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
263  * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
264  * pages.
265  */
266 static u64 __read_mostly shadow_acc_track_mask;
267
268 /*
269  * The mask/shift to use for saving the original R/X bits when marking the PTE
270  * as not-present for access tracking purposes. We do not save the W bit as the
271  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
272  * restored only when a write is attempted to the page.
273  */
274 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
275                                                     PT64_EPT_EXECUTABLE_MASK;
276 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
277
278 /*
279  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
280  * to guard against L1TF attacks.
281  */
282 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
283
284 /*
285  * The number of high-order 1 bits to use in the mask above.
286  */
287 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
288
289 /*
290  * In some cases, we need to preserve the GFN of a non-present or reserved
291  * SPTE when we usurp the upper five bits of the physical address space to
292  * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
293  * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
294  * left into the reserved bits, i.e. the GFN in the SPTE will be split into
295  * high and low parts.  This mask covers the lower bits of the GFN.
296  */
297 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
298
299 /*
300  * The number of non-reserved physical address bits irrespective of features
301  * that repurpose legal bits, e.g. MKTME.
302  */
303 static u8 __read_mostly shadow_phys_bits;
304
305 static void mmu_spte_set(u64 *sptep, u64 spte);
306 static bool is_executable_pte(u64 spte);
307 static union kvm_mmu_page_role
308 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
309
310 #define CREATE_TRACE_POINTS
311 #include "mmutrace.h"
312
313
314 static inline bool kvm_available_flush_tlb_with_range(void)
315 {
316         return kvm_x86_ops->tlb_remote_flush_with_range;
317 }
318
319 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
320                 struct kvm_tlb_range *range)
321 {
322         int ret = -ENOTSUPP;
323
324         if (range && kvm_x86_ops->tlb_remote_flush_with_range)
325                 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
326
327         if (ret)
328                 kvm_flush_remote_tlbs(kvm);
329 }
330
331 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
332                 u64 start_gfn, u64 pages)
333 {
334         struct kvm_tlb_range range;
335
336         range.start_gfn = start_gfn;
337         range.pages = pages;
338
339         kvm_flush_remote_tlbs_with_range(kvm, &range);
340 }
341
342 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
343 {
344         BUG_ON((u64)(unsigned)access_mask != access_mask);
345         BUG_ON((mmio_mask & mmio_value) != mmio_value);
346         WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
347         WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
348         shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
349         shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
350         shadow_mmio_access_mask = access_mask;
351 }
352 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
353
354 static bool is_mmio_spte(u64 spte)
355 {
356         return (spte & shadow_mmio_mask) == shadow_mmio_value;
357 }
358
359 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
360 {
361         return sp->role.ad_disabled;
362 }
363
364 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
365 {
366         /*
367          * When using the EPT page-modification log, the GPAs in the log
368          * would come from L2 rather than L1.  Therefore, we need to rely
369          * on write protection to record dirty pages.  This also bypasses
370          * PML, since writes now result in a vmexit.
371          */
372         return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
373 }
374
375 static inline bool spte_ad_enabled(u64 spte)
376 {
377         MMU_WARN_ON(is_mmio_spte(spte));
378         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
379 }
380
381 static inline bool spte_ad_need_write_protect(u64 spte)
382 {
383         MMU_WARN_ON(is_mmio_spte(spte));
384         return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
385 }
386
387 static bool is_nx_huge_page_enabled(void)
388 {
389         return READ_ONCE(nx_huge_pages);
390 }
391
392 static inline u64 spte_shadow_accessed_mask(u64 spte)
393 {
394         MMU_WARN_ON(is_mmio_spte(spte));
395         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
396 }
397
398 static inline u64 spte_shadow_dirty_mask(u64 spte)
399 {
400         MMU_WARN_ON(is_mmio_spte(spte));
401         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
402 }
403
404 static inline bool is_access_track_spte(u64 spte)
405 {
406         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
407 }
408
409 /*
410  * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
411  * the memslots generation and is derived as follows:
412  *
413  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
414  * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
415  *
416  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
417  * the MMIO generation number, as doing so would require stealing a bit from
418  * the "real" generation number and thus effectively halve the maximum number
419  * of MMIO generations that can be handled before encountering a wrap (which
420  * requires a full MMU zap).  The flag is instead explicitly queried when
421  * checking for MMIO spte cache hits.
422  */
423
424 #define MMIO_SPTE_GEN_LOW_START         3
425 #define MMIO_SPTE_GEN_LOW_END           11
426
427 #define MMIO_SPTE_GEN_HIGH_START        PT64_SECOND_AVAIL_BITS_SHIFT
428 #define MMIO_SPTE_GEN_HIGH_END          62
429
430 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
431                                                     MMIO_SPTE_GEN_LOW_START)
432 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
433                                                     MMIO_SPTE_GEN_HIGH_START)
434
435 #define MMIO_SPTE_GEN_LOW_BITS          (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
436 #define MMIO_SPTE_GEN_HIGH_BITS         (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
437
438 /* remember to adjust the comment above as well if you change these */
439 static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
440
441 #define MMIO_SPTE_GEN_LOW_SHIFT         (MMIO_SPTE_GEN_LOW_START - 0)
442 #define MMIO_SPTE_GEN_HIGH_SHIFT        (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
443
444 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
445
446 static u64 generation_mmio_spte_mask(u64 gen)
447 {
448         u64 mask;
449
450         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
451         BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
452
453         mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
454         mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
455         return mask;
456 }
457
458 static u64 get_mmio_spte_generation(u64 spte)
459 {
460         u64 gen;
461
462         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
463         gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
464         return gen;
465 }
466
467 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
468                            unsigned access)
469 {
470         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
471         u64 mask = generation_mmio_spte_mask(gen);
472         u64 gpa = gfn << PAGE_SHIFT;
473
474         access &= shadow_mmio_access_mask;
475         mask |= shadow_mmio_value | access;
476         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
477         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
478                 << shadow_nonpresent_or_rsvd_mask_len;
479
480         trace_mark_mmio_spte(sptep, gfn, access, gen);
481         mmu_spte_set(sptep, mask);
482 }
483
484 static gfn_t get_mmio_spte_gfn(u64 spte)
485 {
486         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
487
488         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
489                & shadow_nonpresent_or_rsvd_mask;
490
491         return gpa >> PAGE_SHIFT;
492 }
493
494 static unsigned get_mmio_spte_access(u64 spte)
495 {
496         return spte & shadow_mmio_access_mask;
497 }
498
499 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
500                           kvm_pfn_t pfn, unsigned access)
501 {
502         if (unlikely(is_noslot_pfn(pfn))) {
503                 mark_mmio_spte(vcpu, sptep, gfn, access);
504                 return true;
505         }
506
507         return false;
508 }
509
510 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
511 {
512         u64 kvm_gen, spte_gen, gen;
513
514         gen = kvm_vcpu_memslots(vcpu)->generation;
515         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
516                 return false;
517
518         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
519         spte_gen = get_mmio_spte_generation(spte);
520
521         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
522         return likely(kvm_gen == spte_gen);
523 }
524
525 /*
526  * Sets the shadow PTE masks used by the MMU.
527  *
528  * Assumptions:
529  *  - Setting either @accessed_mask or @dirty_mask requires setting both
530  *  - At least one of @accessed_mask or @acc_track_mask must be set
531  */
532 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
533                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
534                 u64 acc_track_mask, u64 me_mask)
535 {
536         BUG_ON(!dirty_mask != !accessed_mask);
537         BUG_ON(!accessed_mask && !acc_track_mask);
538         BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
539
540         shadow_user_mask = user_mask;
541         shadow_accessed_mask = accessed_mask;
542         shadow_dirty_mask = dirty_mask;
543         shadow_nx_mask = nx_mask;
544         shadow_x_mask = x_mask;
545         shadow_present_mask = p_mask;
546         shadow_acc_track_mask = acc_track_mask;
547         shadow_me_mask = me_mask;
548 }
549 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
550
551 static u8 kvm_get_shadow_phys_bits(void)
552 {
553         /*
554          * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
555          * in CPU detection code, but the processor treats those reduced bits as
556          * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
557          * the physical address bits reported by CPUID.
558          */
559         if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
560                 return cpuid_eax(0x80000008) & 0xff;
561
562         /*
563          * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
564          * custom CPUID.  Proceed with whatever the kernel found since these features
565          * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
566          */
567         return boot_cpu_data.x86_phys_bits;
568 }
569
570 static void kvm_mmu_reset_all_pte_masks(void)
571 {
572         u8 low_phys_bits;
573
574         shadow_user_mask = 0;
575         shadow_accessed_mask = 0;
576         shadow_dirty_mask = 0;
577         shadow_nx_mask = 0;
578         shadow_x_mask = 0;
579         shadow_mmio_mask = 0;
580         shadow_present_mask = 0;
581         shadow_acc_track_mask = 0;
582
583         shadow_phys_bits = kvm_get_shadow_phys_bits();
584
585         /*
586          * If the CPU has 46 or less physical address bits, then set an
587          * appropriate mask to guard against L1TF attacks. Otherwise, it is
588          * assumed that the CPU is not vulnerable to L1TF.
589          *
590          * Some Intel CPUs address the L1 cache using more PA bits than are
591          * reported by CPUID. Use the PA width of the L1 cache when possible
592          * to achieve more effective mitigation, e.g. if system RAM overlaps
593          * the most significant bits of legal physical address space.
594          */
595         shadow_nonpresent_or_rsvd_mask = 0;
596         low_phys_bits = boot_cpu_data.x86_phys_bits;
597         if (boot_cpu_has_bug(X86_BUG_L1TF) &&
598             !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
599                           52 - shadow_nonpresent_or_rsvd_mask_len)) {
600                 low_phys_bits = boot_cpu_data.x86_cache_bits
601                         - shadow_nonpresent_or_rsvd_mask_len;
602                 shadow_nonpresent_or_rsvd_mask =
603                         rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
604         }
605
606         shadow_nonpresent_or_rsvd_lower_gfn_mask =
607                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
608 }
609
610 static int is_cpuid_PSE36(void)
611 {
612         return 1;
613 }
614
615 static int is_nx(struct kvm_vcpu *vcpu)
616 {
617         return vcpu->arch.efer & EFER_NX;
618 }
619
620 static int is_shadow_present_pte(u64 pte)
621 {
622         return (pte != 0) && !is_mmio_spte(pte);
623 }
624
625 static int is_large_pte(u64 pte)
626 {
627         return pte & PT_PAGE_SIZE_MASK;
628 }
629
630 static int is_last_spte(u64 pte, int level)
631 {
632         if (level == PT_PAGE_TABLE_LEVEL)
633                 return 1;
634         if (is_large_pte(pte))
635                 return 1;
636         return 0;
637 }
638
639 static bool is_executable_pte(u64 spte)
640 {
641         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
642 }
643
644 static kvm_pfn_t spte_to_pfn(u64 pte)
645 {
646         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
647 }
648
649 static gfn_t pse36_gfn_delta(u32 gpte)
650 {
651         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
652
653         return (gpte & PT32_DIR_PSE36_MASK) << shift;
654 }
655
656 #ifdef CONFIG_X86_64
657 static void __set_spte(u64 *sptep, u64 spte)
658 {
659         WRITE_ONCE(*sptep, spte);
660 }
661
662 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
663 {
664         WRITE_ONCE(*sptep, spte);
665 }
666
667 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
668 {
669         return xchg(sptep, spte);
670 }
671
672 static u64 __get_spte_lockless(u64 *sptep)
673 {
674         return READ_ONCE(*sptep);
675 }
676 #else
677 union split_spte {
678         struct {
679                 u32 spte_low;
680                 u32 spte_high;
681         };
682         u64 spte;
683 };
684
685 static void count_spte_clear(u64 *sptep, u64 spte)
686 {
687         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
688
689         if (is_shadow_present_pte(spte))
690                 return;
691
692         /* Ensure the spte is completely set before we increase the count */
693         smp_wmb();
694         sp->clear_spte_count++;
695 }
696
697 static void __set_spte(u64 *sptep, u64 spte)
698 {
699         union split_spte *ssptep, sspte;
700
701         ssptep = (union split_spte *)sptep;
702         sspte = (union split_spte)spte;
703
704         ssptep->spte_high = sspte.spte_high;
705
706         /*
707          * If we map the spte from nonpresent to present, We should store
708          * the high bits firstly, then set present bit, so cpu can not
709          * fetch this spte while we are setting the spte.
710          */
711         smp_wmb();
712
713         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
714 }
715
716 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
717 {
718         union split_spte *ssptep, sspte;
719
720         ssptep = (union split_spte *)sptep;
721         sspte = (union split_spte)spte;
722
723         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
724
725         /*
726          * If we map the spte from present to nonpresent, we should clear
727          * present bit firstly to avoid vcpu fetch the old high bits.
728          */
729         smp_wmb();
730
731         ssptep->spte_high = sspte.spte_high;
732         count_spte_clear(sptep, spte);
733 }
734
735 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
736 {
737         union split_spte *ssptep, sspte, orig;
738
739         ssptep = (union split_spte *)sptep;
740         sspte = (union split_spte)spte;
741
742         /* xchg acts as a barrier before the setting of the high bits */
743         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
744         orig.spte_high = ssptep->spte_high;
745         ssptep->spte_high = sspte.spte_high;
746         count_spte_clear(sptep, spte);
747
748         return orig.spte;
749 }
750
751 /*
752  * The idea using the light way get the spte on x86_32 guest is from
753  * gup_get_pte (mm/gup.c).
754  *
755  * An spte tlb flush may be pending, because kvm_set_pte_rmapp
756  * coalesces them and we are running out of the MMU lock.  Therefore
757  * we need to protect against in-progress updates of the spte.
758  *
759  * Reading the spte while an update is in progress may get the old value
760  * for the high part of the spte.  The race is fine for a present->non-present
761  * change (because the high part of the spte is ignored for non-present spte),
762  * but for a present->present change we must reread the spte.
763  *
764  * All such changes are done in two steps (present->non-present and
765  * non-present->present), hence it is enough to count the number of
766  * present->non-present updates: if it changed while reading the spte,
767  * we might have hit the race.  This is done using clear_spte_count.
768  */
769 static u64 __get_spte_lockless(u64 *sptep)
770 {
771         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
772         union split_spte spte, *orig = (union split_spte *)sptep;
773         int count;
774
775 retry:
776         count = sp->clear_spte_count;
777         smp_rmb();
778
779         spte.spte_low = orig->spte_low;
780         smp_rmb();
781
782         spte.spte_high = orig->spte_high;
783         smp_rmb();
784
785         if (unlikely(spte.spte_low != orig->spte_low ||
786               count != sp->clear_spte_count))
787                 goto retry;
788
789         return spte.spte;
790 }
791 #endif
792
793 static bool spte_can_locklessly_be_made_writable(u64 spte)
794 {
795         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
796                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
797 }
798
799 static bool spte_has_volatile_bits(u64 spte)
800 {
801         if (!is_shadow_present_pte(spte))
802                 return false;
803
804         /*
805          * Always atomically update spte if it can be updated
806          * out of mmu-lock, it can ensure dirty bit is not lost,
807          * also, it can help us to get a stable is_writable_pte()
808          * to ensure tlb flush is not missed.
809          */
810         if (spte_can_locklessly_be_made_writable(spte) ||
811             is_access_track_spte(spte))
812                 return true;
813
814         if (spte_ad_enabled(spte)) {
815                 if ((spte & shadow_accessed_mask) == 0 ||
816                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
817                         return true;
818         }
819
820         return false;
821 }
822
823 static bool is_accessed_spte(u64 spte)
824 {
825         u64 accessed_mask = spte_shadow_accessed_mask(spte);
826
827         return accessed_mask ? spte & accessed_mask
828                              : !is_access_track_spte(spte);
829 }
830
831 static bool is_dirty_spte(u64 spte)
832 {
833         u64 dirty_mask = spte_shadow_dirty_mask(spte);
834
835         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
836 }
837
838 /* Rules for using mmu_spte_set:
839  * Set the sptep from nonpresent to present.
840  * Note: the sptep being assigned *must* be either not present
841  * or in a state where the hardware will not attempt to update
842  * the spte.
843  */
844 static void mmu_spte_set(u64 *sptep, u64 new_spte)
845 {
846         WARN_ON(is_shadow_present_pte(*sptep));
847         __set_spte(sptep, new_spte);
848 }
849
850 /*
851  * Update the SPTE (excluding the PFN), but do not track changes in its
852  * accessed/dirty status.
853  */
854 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
855 {
856         u64 old_spte = *sptep;
857
858         WARN_ON(!is_shadow_present_pte(new_spte));
859
860         if (!is_shadow_present_pte(old_spte)) {
861                 mmu_spte_set(sptep, new_spte);
862                 return old_spte;
863         }
864
865         if (!spte_has_volatile_bits(old_spte))
866                 __update_clear_spte_fast(sptep, new_spte);
867         else
868                 old_spte = __update_clear_spte_slow(sptep, new_spte);
869
870         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
871
872         return old_spte;
873 }
874
875 /* Rules for using mmu_spte_update:
876  * Update the state bits, it means the mapped pfn is not changed.
877  *
878  * Whenever we overwrite a writable spte with a read-only one we
879  * should flush remote TLBs. Otherwise rmap_write_protect
880  * will find a read-only spte, even though the writable spte
881  * might be cached on a CPU's TLB, the return value indicates this
882  * case.
883  *
884  * Returns true if the TLB needs to be flushed
885  */
886 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
887 {
888         bool flush = false;
889         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
890
891         if (!is_shadow_present_pte(old_spte))
892                 return false;
893
894         /*
895          * For the spte updated out of mmu-lock is safe, since
896          * we always atomically update it, see the comments in
897          * spte_has_volatile_bits().
898          */
899         if (spte_can_locklessly_be_made_writable(old_spte) &&
900               !is_writable_pte(new_spte))
901                 flush = true;
902
903         /*
904          * Flush TLB when accessed/dirty states are changed in the page tables,
905          * to guarantee consistency between TLB and page tables.
906          */
907
908         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
909                 flush = true;
910                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
911         }
912
913         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
914                 flush = true;
915                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
916         }
917
918         return flush;
919 }
920
921 /*
922  * Rules for using mmu_spte_clear_track_bits:
923  * It sets the sptep from present to nonpresent, and track the
924  * state bits, it is used to clear the last level sptep.
925  * Returns non-zero if the PTE was previously valid.
926  */
927 static int mmu_spte_clear_track_bits(u64 *sptep)
928 {
929         kvm_pfn_t pfn;
930         u64 old_spte = *sptep;
931
932         if (!spte_has_volatile_bits(old_spte))
933                 __update_clear_spte_fast(sptep, 0ull);
934         else
935                 old_spte = __update_clear_spte_slow(sptep, 0ull);
936
937         if (!is_shadow_present_pte(old_spte))
938                 return 0;
939
940         pfn = spte_to_pfn(old_spte);
941
942         /*
943          * KVM does not hold the refcount of the page used by
944          * kvm mmu, before reclaiming the page, we should
945          * unmap it from mmu first.
946          */
947         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
948
949         if (is_accessed_spte(old_spte))
950                 kvm_set_pfn_accessed(pfn);
951
952         if (is_dirty_spte(old_spte))
953                 kvm_set_pfn_dirty(pfn);
954
955         return 1;
956 }
957
958 /*
959  * Rules for using mmu_spte_clear_no_track:
960  * Directly clear spte without caring the state bits of sptep,
961  * it is used to set the upper level spte.
962  */
963 static void mmu_spte_clear_no_track(u64 *sptep)
964 {
965         __update_clear_spte_fast(sptep, 0ull);
966 }
967
968 static u64 mmu_spte_get_lockless(u64 *sptep)
969 {
970         return __get_spte_lockless(sptep);
971 }
972
973 static u64 mark_spte_for_access_track(u64 spte)
974 {
975         if (spte_ad_enabled(spte))
976                 return spte & ~shadow_accessed_mask;
977
978         if (is_access_track_spte(spte))
979                 return spte;
980
981         /*
982          * Making an Access Tracking PTE will result in removal of write access
983          * from the PTE. So, verify that we will be able to restore the write
984          * access in the fast page fault path later on.
985          */
986         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
987                   !spte_can_locklessly_be_made_writable(spte),
988                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
989
990         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
991                           shadow_acc_track_saved_bits_shift),
992                   "kvm: Access Tracking saved bit locations are not zero\n");
993
994         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
995                 shadow_acc_track_saved_bits_shift;
996         spte &= ~shadow_acc_track_mask;
997
998         return spte;
999 }
1000
1001 /* Restore an acc-track PTE back to a regular PTE */
1002 static u64 restore_acc_track_spte(u64 spte)
1003 {
1004         u64 new_spte = spte;
1005         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
1006                          & shadow_acc_track_saved_bits_mask;
1007
1008         WARN_ON_ONCE(spte_ad_enabled(spte));
1009         WARN_ON_ONCE(!is_access_track_spte(spte));
1010
1011         new_spte &= ~shadow_acc_track_mask;
1012         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
1013                       shadow_acc_track_saved_bits_shift);
1014         new_spte |= saved_bits;
1015
1016         return new_spte;
1017 }
1018
1019 /* Returns the Accessed status of the PTE and resets it at the same time. */
1020 static bool mmu_spte_age(u64 *sptep)
1021 {
1022         u64 spte = mmu_spte_get_lockless(sptep);
1023
1024         if (!is_accessed_spte(spte))
1025                 return false;
1026
1027         if (spte_ad_enabled(spte)) {
1028                 clear_bit((ffs(shadow_accessed_mask) - 1),
1029                           (unsigned long *)sptep);
1030         } else {
1031                 /*
1032                  * Capture the dirty status of the page, so that it doesn't get
1033                  * lost when the SPTE is marked for access tracking.
1034                  */
1035                 if (is_writable_pte(spte))
1036                         kvm_set_pfn_dirty(spte_to_pfn(spte));
1037
1038                 spte = mark_spte_for_access_track(spte);
1039                 mmu_spte_update_no_track(sptep, spte);
1040         }
1041
1042         return true;
1043 }
1044
1045 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1046 {
1047         /*
1048          * Prevent page table teardown by making any free-er wait during
1049          * kvm_flush_remote_tlbs() IPI to all active vcpus.
1050          */
1051         local_irq_disable();
1052
1053         /*
1054          * Make sure a following spte read is not reordered ahead of the write
1055          * to vcpu->mode.
1056          */
1057         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1058 }
1059
1060 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1061 {
1062         /*
1063          * Make sure the write to vcpu->mode is not reordered in front of
1064          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
1065          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
1066          */
1067         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1068         local_irq_enable();
1069 }
1070
1071 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1072                                   struct kmem_cache *base_cache, int min)
1073 {
1074         void *obj;
1075
1076         if (cache->nobjs >= min)
1077                 return 0;
1078         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1079                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1080                 if (!obj)
1081                         return cache->nobjs >= min ? 0 : -ENOMEM;
1082                 cache->objects[cache->nobjs++] = obj;
1083         }
1084         return 0;
1085 }
1086
1087 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1088 {
1089         return cache->nobjs;
1090 }
1091
1092 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1093                                   struct kmem_cache *cache)
1094 {
1095         while (mc->nobjs)
1096                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1097 }
1098
1099 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1100                                        int min)
1101 {
1102         void *page;
1103
1104         if (cache->nobjs >= min)
1105                 return 0;
1106         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1107                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1108                 if (!page)
1109                         return cache->nobjs >= min ? 0 : -ENOMEM;
1110                 cache->objects[cache->nobjs++] = page;
1111         }
1112         return 0;
1113 }
1114
1115 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1116 {
1117         while (mc->nobjs)
1118                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1119 }
1120
1121 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1122 {
1123         int r;
1124
1125         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1126                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1127         if (r)
1128                 goto out;
1129         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1130         if (r)
1131                 goto out;
1132         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1133                                    mmu_page_header_cache, 4);
1134 out:
1135         return r;
1136 }
1137
1138 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1139 {
1140         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1141                                 pte_list_desc_cache);
1142         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1143         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1144                                 mmu_page_header_cache);
1145 }
1146
1147 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1148 {
1149         void *p;
1150
1151         BUG_ON(!mc->nobjs);
1152         p = mc->objects[--mc->nobjs];
1153         return p;
1154 }
1155
1156 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1157 {
1158         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1159 }
1160
1161 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1162 {
1163         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1164 }
1165
1166 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1167 {
1168         if (!sp->role.direct)
1169                 return sp->gfns[index];
1170
1171         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1172 }
1173
1174 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1175 {
1176         if (!sp->role.direct) {
1177                 sp->gfns[index] = gfn;
1178                 return;
1179         }
1180
1181         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1182                 pr_err_ratelimited("gfn mismatch under direct page %llx "
1183                                    "(expected %llx, got %llx)\n",
1184                                    sp->gfn,
1185                                    kvm_mmu_page_get_gfn(sp, index), gfn);
1186 }
1187
1188 /*
1189  * Return the pointer to the large page information for a given gfn,
1190  * handling slots that are not large page aligned.
1191  */
1192 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1193                                               struct kvm_memory_slot *slot,
1194                                               int level)
1195 {
1196         unsigned long idx;
1197
1198         idx = gfn_to_index(gfn, slot->base_gfn, level);
1199         return &slot->arch.lpage_info[level - 2][idx];
1200 }
1201
1202 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1203                                             gfn_t gfn, int count)
1204 {
1205         struct kvm_lpage_info *linfo;
1206         int i;
1207
1208         for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1209                 linfo = lpage_info_slot(gfn, slot, i);
1210                 linfo->disallow_lpage += count;
1211                 WARN_ON(linfo->disallow_lpage < 0);
1212         }
1213 }
1214
1215 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1216 {
1217         update_gfn_disallow_lpage_count(slot, gfn, 1);
1218 }
1219
1220 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1221 {
1222         update_gfn_disallow_lpage_count(slot, gfn, -1);
1223 }
1224
1225 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1226 {
1227         struct kvm_memslots *slots;
1228         struct kvm_memory_slot *slot;
1229         gfn_t gfn;
1230
1231         kvm->arch.indirect_shadow_pages++;
1232         gfn = sp->gfn;
1233         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1234         slot = __gfn_to_memslot(slots, gfn);
1235
1236         /* the non-leaf shadow pages are keeping readonly. */
1237         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1238                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1239                                                     KVM_PAGE_TRACK_WRITE);
1240
1241         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1242 }
1243
1244 static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1245 {
1246         if (sp->lpage_disallowed)
1247                 return;
1248
1249         ++kvm->stat.nx_lpage_splits;
1250         list_add_tail(&sp->lpage_disallowed_link,
1251                       &kvm->arch.lpage_disallowed_mmu_pages);
1252         sp->lpage_disallowed = true;
1253 }
1254
1255 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1256 {
1257         struct kvm_memslots *slots;
1258         struct kvm_memory_slot *slot;
1259         gfn_t gfn;
1260
1261         kvm->arch.indirect_shadow_pages--;
1262         gfn = sp->gfn;
1263         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1264         slot = __gfn_to_memslot(slots, gfn);
1265         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1266                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1267                                                        KVM_PAGE_TRACK_WRITE);
1268
1269         kvm_mmu_gfn_allow_lpage(slot, gfn);
1270 }
1271
1272 static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1273 {
1274         --kvm->stat.nx_lpage_splits;
1275         sp->lpage_disallowed = false;
1276         list_del(&sp->lpage_disallowed_link);
1277 }
1278
1279 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1280                                           struct kvm_memory_slot *slot)
1281 {
1282         struct kvm_lpage_info *linfo;
1283
1284         if (slot) {
1285                 linfo = lpage_info_slot(gfn, slot, level);
1286                 return !!linfo->disallow_lpage;
1287         }
1288
1289         return true;
1290 }
1291
1292 static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1293                                         int level)
1294 {
1295         struct kvm_memory_slot *slot;
1296
1297         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1298         return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1299 }
1300
1301 static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
1302 {
1303         unsigned long page_size;
1304         int i, ret = 0;
1305
1306         page_size = kvm_host_page_size(vcpu, gfn);
1307
1308         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1309                 if (page_size >= KVM_HPAGE_SIZE(i))
1310                         ret = i;
1311                 else
1312                         break;
1313         }
1314
1315         return ret;
1316 }
1317
1318 static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1319                                           bool no_dirty_log)
1320 {
1321         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1322                 return false;
1323         if (no_dirty_log && slot->dirty_bitmap)
1324                 return false;
1325
1326         return true;
1327 }
1328
1329 static struct kvm_memory_slot *
1330 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1331                             bool no_dirty_log)
1332 {
1333         struct kvm_memory_slot *slot;
1334
1335         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1336         if (!memslot_valid_for_gpte(slot, no_dirty_log))
1337                 slot = NULL;
1338
1339         return slot;
1340 }
1341
1342 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1343                          bool *force_pt_level)
1344 {
1345         int host_level, level, max_level;
1346         struct kvm_memory_slot *slot;
1347
1348         if (unlikely(*force_pt_level))
1349                 return PT_PAGE_TABLE_LEVEL;
1350
1351         slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1352         *force_pt_level = !memslot_valid_for_gpte(slot, true);
1353         if (unlikely(*force_pt_level))
1354                 return PT_PAGE_TABLE_LEVEL;
1355
1356         host_level = host_mapping_level(vcpu, large_gfn);
1357
1358         if (host_level == PT_PAGE_TABLE_LEVEL)
1359                 return host_level;
1360
1361         max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1362
1363         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1364                 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1365                         break;
1366
1367         return level - 1;
1368 }
1369
1370 /*
1371  * About rmap_head encoding:
1372  *
1373  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1374  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1375  * pte_list_desc containing more mappings.
1376  */
1377
1378 /*
1379  * Returns the number of pointers in the rmap chain, not counting the new one.
1380  */
1381 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1382                         struct kvm_rmap_head *rmap_head)
1383 {
1384         struct pte_list_desc *desc;
1385         int i, count = 0;
1386
1387         if (!rmap_head->val) {
1388                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1389                 rmap_head->val = (unsigned long)spte;
1390         } else if (!(rmap_head->val & 1)) {
1391                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1392                 desc = mmu_alloc_pte_list_desc(vcpu);
1393                 desc->sptes[0] = (u64 *)rmap_head->val;
1394                 desc->sptes[1] = spte;
1395                 rmap_head->val = (unsigned long)desc | 1;
1396                 ++count;
1397         } else {
1398                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1399                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1400                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1401                         desc = desc->more;
1402                         count += PTE_LIST_EXT;
1403                 }
1404                 if (desc->sptes[PTE_LIST_EXT-1]) {
1405                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1406                         desc = desc->more;
1407                 }
1408                 for (i = 0; desc->sptes[i]; ++i)
1409                         ++count;
1410                 desc->sptes[i] = spte;
1411         }
1412         return count;
1413 }
1414
1415 static void
1416 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1417                            struct pte_list_desc *desc, int i,
1418                            struct pte_list_desc *prev_desc)
1419 {
1420         int j;
1421
1422         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1423                 ;
1424         desc->sptes[i] = desc->sptes[j];
1425         desc->sptes[j] = NULL;
1426         if (j != 0)
1427                 return;
1428         if (!prev_desc && !desc->more)
1429                 rmap_head->val = (unsigned long)desc->sptes[0];
1430         else
1431                 if (prev_desc)
1432                         prev_desc->more = desc->more;
1433                 else
1434                         rmap_head->val = (unsigned long)desc->more | 1;
1435         mmu_free_pte_list_desc(desc);
1436 }
1437
1438 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1439 {
1440         struct pte_list_desc *desc;
1441         struct pte_list_desc *prev_desc;
1442         int i;
1443
1444         if (!rmap_head->val) {
1445                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1446                 BUG();
1447         } else if (!(rmap_head->val & 1)) {
1448                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1449                 if ((u64 *)rmap_head->val != spte) {
1450                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1451                         BUG();
1452                 }
1453                 rmap_head->val = 0;
1454         } else {
1455                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1456                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1457                 prev_desc = NULL;
1458                 while (desc) {
1459                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1460                                 if (desc->sptes[i] == spte) {
1461                                         pte_list_desc_remove_entry(rmap_head,
1462                                                         desc, i, prev_desc);
1463                                         return;
1464                                 }
1465                         }
1466                         prev_desc = desc;
1467                         desc = desc->more;
1468                 }
1469                 pr_err("%s: %p many->many\n", __func__, spte);
1470                 BUG();
1471         }
1472 }
1473
1474 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1475 {
1476         mmu_spte_clear_track_bits(sptep);
1477         __pte_list_remove(sptep, rmap_head);
1478 }
1479
1480 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1481                                            struct kvm_memory_slot *slot)
1482 {
1483         unsigned long idx;
1484
1485         idx = gfn_to_index(gfn, slot->base_gfn, level);
1486         return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1487 }
1488
1489 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1490                                          struct kvm_mmu_page *sp)
1491 {
1492         struct kvm_memslots *slots;
1493         struct kvm_memory_slot *slot;
1494
1495         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1496         slot = __gfn_to_memslot(slots, gfn);
1497         return __gfn_to_rmap(gfn, sp->role.level, slot);
1498 }
1499
1500 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1501 {
1502         struct kvm_mmu_memory_cache *cache;
1503
1504         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1505         return mmu_memory_cache_free_objects(cache);
1506 }
1507
1508 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1509 {
1510         struct kvm_mmu_page *sp;
1511         struct kvm_rmap_head *rmap_head;
1512
1513         sp = page_header(__pa(spte));
1514         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1515         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1516         return pte_list_add(vcpu, spte, rmap_head);
1517 }
1518
1519 static void rmap_remove(struct kvm *kvm, u64 *spte)
1520 {
1521         struct kvm_mmu_page *sp;
1522         gfn_t gfn;
1523         struct kvm_rmap_head *rmap_head;
1524
1525         sp = page_header(__pa(spte));
1526         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1527         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1528         __pte_list_remove(spte, rmap_head);
1529 }
1530
1531 /*
1532  * Used by the following functions to iterate through the sptes linked by a
1533  * rmap.  All fields are private and not assumed to be used outside.
1534  */
1535 struct rmap_iterator {
1536         /* private fields */
1537         struct pte_list_desc *desc;     /* holds the sptep if not NULL */
1538         int pos;                        /* index of the sptep */
1539 };
1540
1541 /*
1542  * Iteration must be started by this function.  This should also be used after
1543  * removing/dropping sptes from the rmap link because in such cases the
1544  * information in the itererator may not be valid.
1545  *
1546  * Returns sptep if found, NULL otherwise.
1547  */
1548 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1549                            struct rmap_iterator *iter)
1550 {
1551         u64 *sptep;
1552
1553         if (!rmap_head->val)
1554                 return NULL;
1555
1556         if (!(rmap_head->val & 1)) {
1557                 iter->desc = NULL;
1558                 sptep = (u64 *)rmap_head->val;
1559                 goto out;
1560         }
1561
1562         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1563         iter->pos = 0;
1564         sptep = iter->desc->sptes[iter->pos];
1565 out:
1566         BUG_ON(!is_shadow_present_pte(*sptep));
1567         return sptep;
1568 }
1569
1570 /*
1571  * Must be used with a valid iterator: e.g. after rmap_get_first().
1572  *
1573  * Returns sptep if found, NULL otherwise.
1574  */
1575 static u64 *rmap_get_next(struct rmap_iterator *iter)
1576 {
1577         u64 *sptep;
1578
1579         if (iter->desc) {
1580                 if (iter->pos < PTE_LIST_EXT - 1) {
1581                         ++iter->pos;
1582                         sptep = iter->desc->sptes[iter->pos];
1583                         if (sptep)
1584                                 goto out;
1585                 }
1586
1587                 iter->desc = iter->desc->more;
1588
1589                 if (iter->desc) {
1590                         iter->pos = 0;
1591                         /* desc->sptes[0] cannot be NULL */
1592                         sptep = iter->desc->sptes[iter->pos];
1593                         goto out;
1594                 }
1595         }
1596
1597         return NULL;
1598 out:
1599         BUG_ON(!is_shadow_present_pte(*sptep));
1600         return sptep;
1601 }
1602
1603 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)                 \
1604         for (_spte_ = rmap_get_first(_rmap_head_, _iter_);              \
1605              _spte_; _spte_ = rmap_get_next(_iter_))
1606
1607 static void drop_spte(struct kvm *kvm, u64 *sptep)
1608 {
1609         if (mmu_spte_clear_track_bits(sptep))
1610                 rmap_remove(kvm, sptep);
1611 }
1612
1613
1614 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1615 {
1616         if (is_large_pte(*sptep)) {
1617                 WARN_ON(page_header(__pa(sptep))->role.level ==
1618                         PT_PAGE_TABLE_LEVEL);
1619                 drop_spte(kvm, sptep);
1620                 --kvm->stat.lpages;
1621                 return true;
1622         }
1623
1624         return false;
1625 }
1626
1627 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1628 {
1629         if (__drop_large_spte(vcpu->kvm, sptep)) {
1630                 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1631
1632                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1633                         KVM_PAGES_PER_HPAGE(sp->role.level));
1634         }
1635 }
1636
1637 /*
1638  * Write-protect on the specified @sptep, @pt_protect indicates whether
1639  * spte write-protection is caused by protecting shadow page table.
1640  *
1641  * Note: write protection is difference between dirty logging and spte
1642  * protection:
1643  * - for dirty logging, the spte can be set to writable at anytime if
1644  *   its dirty bitmap is properly set.
1645  * - for spte protection, the spte can be writable only after unsync-ing
1646  *   shadow page.
1647  *
1648  * Return true if tlb need be flushed.
1649  */
1650 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1651 {
1652         u64 spte = *sptep;
1653
1654         if (!is_writable_pte(spte) &&
1655               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1656                 return false;
1657
1658         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1659
1660         if (pt_protect)
1661                 spte &= ~SPTE_MMU_WRITEABLE;
1662         spte = spte & ~PT_WRITABLE_MASK;
1663
1664         return mmu_spte_update(sptep, spte);
1665 }
1666
1667 static bool __rmap_write_protect(struct kvm *kvm,
1668                                  struct kvm_rmap_head *rmap_head,
1669                                  bool pt_protect)
1670 {
1671         u64 *sptep;
1672         struct rmap_iterator iter;
1673         bool flush = false;
1674
1675         for_each_rmap_spte(rmap_head, &iter, sptep)
1676                 flush |= spte_write_protect(sptep, pt_protect);
1677
1678         return flush;
1679 }
1680
1681 static bool spte_clear_dirty(u64 *sptep)
1682 {
1683         u64 spte = *sptep;
1684
1685         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1686
1687         MMU_WARN_ON(!spte_ad_enabled(spte));
1688         spte &= ~shadow_dirty_mask;
1689         return mmu_spte_update(sptep, spte);
1690 }
1691
1692 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1693 {
1694         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1695                                                (unsigned long *)sptep);
1696         if (was_writable && !spte_ad_enabled(*sptep))
1697                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1698
1699         return was_writable;
1700 }
1701
1702 /*
1703  * Gets the GFN ready for another round of dirty logging by clearing the
1704  *      - D bit on ad-enabled SPTEs, and
1705  *      - W bit on ad-disabled SPTEs.
1706  * Returns true iff any D or W bits were cleared.
1707  */
1708 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1709 {
1710         u64 *sptep;
1711         struct rmap_iterator iter;
1712         bool flush = false;
1713
1714         for_each_rmap_spte(rmap_head, &iter, sptep)
1715                 if (spte_ad_need_write_protect(*sptep))
1716                         flush |= spte_wrprot_for_clear_dirty(sptep);
1717                 else
1718                         flush |= spte_clear_dirty(sptep);
1719
1720         return flush;
1721 }
1722
1723 static bool spte_set_dirty(u64 *sptep)
1724 {
1725         u64 spte = *sptep;
1726
1727         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1728
1729         /*
1730          * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
1731          * do not bother adding back write access to pages marked
1732          * SPTE_AD_WRPROT_ONLY_MASK.
1733          */
1734         spte |= shadow_dirty_mask;
1735
1736         return mmu_spte_update(sptep, spte);
1737 }
1738
1739 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1740 {
1741         u64 *sptep;
1742         struct rmap_iterator iter;
1743         bool flush = false;
1744
1745         for_each_rmap_spte(rmap_head, &iter, sptep)
1746                 if (spte_ad_enabled(*sptep))
1747                         flush |= spte_set_dirty(sptep);
1748
1749         return flush;
1750 }
1751
1752 /**
1753  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1754  * @kvm: kvm instance
1755  * @slot: slot to protect
1756  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1757  * @mask: indicates which pages we should protect
1758  *
1759  * Used when we do not need to care about huge page mappings: e.g. during dirty
1760  * logging we do not have any such mappings.
1761  */
1762 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1763                                      struct kvm_memory_slot *slot,
1764                                      gfn_t gfn_offset, unsigned long mask)
1765 {
1766         struct kvm_rmap_head *rmap_head;
1767
1768         while (mask) {
1769                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1770                                           PT_PAGE_TABLE_LEVEL, slot);
1771                 __rmap_write_protect(kvm, rmap_head, false);
1772
1773                 /* clear the first set bit */
1774                 mask &= mask - 1;
1775         }
1776 }
1777
1778 /**
1779  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1780  * protect the page if the D-bit isn't supported.
1781  * @kvm: kvm instance
1782  * @slot: slot to clear D-bit
1783  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1784  * @mask: indicates which pages we should clear D-bit
1785  *
1786  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1787  */
1788 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1789                                      struct kvm_memory_slot *slot,
1790                                      gfn_t gfn_offset, unsigned long mask)
1791 {
1792         struct kvm_rmap_head *rmap_head;
1793
1794         while (mask) {
1795                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1796                                           PT_PAGE_TABLE_LEVEL, slot);
1797                 __rmap_clear_dirty(kvm, rmap_head);
1798
1799                 /* clear the first set bit */
1800                 mask &= mask - 1;
1801         }
1802 }
1803 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1804
1805 /**
1806  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1807  * PT level pages.
1808  *
1809  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1810  * enable dirty logging for them.
1811  *
1812  * Used when we do not need to care about huge page mappings: e.g. during dirty
1813  * logging we do not have any such mappings.
1814  */
1815 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1816                                 struct kvm_memory_slot *slot,
1817                                 gfn_t gfn_offset, unsigned long mask)
1818 {
1819         if (kvm_x86_ops->enable_log_dirty_pt_masked)
1820                 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1821                                 mask);
1822         else
1823                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1824 }
1825
1826 /**
1827  * kvm_arch_write_log_dirty - emulate dirty page logging
1828  * @vcpu: Guest mode vcpu
1829  *
1830  * Emulate arch specific page modification logging for the
1831  * nested hypervisor
1832  */
1833 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
1834 {
1835         if (kvm_x86_ops->write_log_dirty)
1836                 return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
1837
1838         return 0;
1839 }
1840
1841 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1842                                     struct kvm_memory_slot *slot, u64 gfn)
1843 {
1844         struct kvm_rmap_head *rmap_head;
1845         int i;
1846         bool write_protected = false;
1847
1848         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1849                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1850                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1851         }
1852
1853         return write_protected;
1854 }
1855
1856 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1857 {
1858         struct kvm_memory_slot *slot;
1859
1860         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1861         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1862 }
1863
1864 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1865 {
1866         u64 *sptep;
1867         struct rmap_iterator iter;
1868         bool flush = false;
1869
1870         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1871                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1872
1873                 pte_list_remove(rmap_head, sptep);
1874                 flush = true;
1875         }
1876
1877         return flush;
1878 }
1879
1880 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1881                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1882                            unsigned long data)
1883 {
1884         return kvm_zap_rmapp(kvm, rmap_head);
1885 }
1886
1887 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1888                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1889                              unsigned long data)
1890 {
1891         u64 *sptep;
1892         struct rmap_iterator iter;
1893         int need_flush = 0;
1894         u64 new_spte;
1895         pte_t *ptep = (pte_t *)data;
1896         kvm_pfn_t new_pfn;
1897
1898         WARN_ON(pte_huge(*ptep));
1899         new_pfn = pte_pfn(*ptep);
1900
1901 restart:
1902         for_each_rmap_spte(rmap_head, &iter, sptep) {
1903                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1904                             sptep, *sptep, gfn, level);
1905
1906                 need_flush = 1;
1907
1908                 if (pte_write(*ptep)) {
1909                         pte_list_remove(rmap_head, sptep);
1910                         goto restart;
1911                 } else {
1912                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1913                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1914
1915                         new_spte &= ~PT_WRITABLE_MASK;
1916                         new_spte &= ~SPTE_HOST_WRITEABLE;
1917
1918                         new_spte = mark_spte_for_access_track(new_spte);
1919
1920                         mmu_spte_clear_track_bits(sptep);
1921                         mmu_spte_set(sptep, new_spte);
1922                 }
1923         }
1924
1925         if (need_flush && kvm_available_flush_tlb_with_range()) {
1926                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1927                 return 0;
1928         }
1929
1930         return need_flush;
1931 }
1932
1933 struct slot_rmap_walk_iterator {
1934         /* input fields. */
1935         struct kvm_memory_slot *slot;
1936         gfn_t start_gfn;
1937         gfn_t end_gfn;
1938         int start_level;
1939         int end_level;
1940
1941         /* output fields. */
1942         gfn_t gfn;
1943         struct kvm_rmap_head *rmap;
1944         int level;
1945
1946         /* private field. */
1947         struct kvm_rmap_head *end_rmap;
1948 };
1949
1950 static void
1951 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1952 {
1953         iterator->level = level;
1954         iterator->gfn = iterator->start_gfn;
1955         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1956         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1957                                            iterator->slot);
1958 }
1959
1960 static void
1961 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1962                     struct kvm_memory_slot *slot, int start_level,
1963                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1964 {
1965         iterator->slot = slot;
1966         iterator->start_level = start_level;
1967         iterator->end_level = end_level;
1968         iterator->start_gfn = start_gfn;
1969         iterator->end_gfn = end_gfn;
1970
1971         rmap_walk_init_level(iterator, iterator->start_level);
1972 }
1973
1974 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1975 {
1976         return !!iterator->rmap;
1977 }
1978
1979 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1980 {
1981         if (++iterator->rmap <= iterator->end_rmap) {
1982                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1983                 return;
1984         }
1985
1986         if (++iterator->level > iterator->end_level) {
1987                 iterator->rmap = NULL;
1988                 return;
1989         }
1990
1991         rmap_walk_init_level(iterator, iterator->level);
1992 }
1993
1994 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,    \
1995            _start_gfn, _end_gfn, _iter_)                                \
1996         for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,         \
1997                                  _end_level_, _start_gfn, _end_gfn);    \
1998              slot_rmap_walk_okay(_iter_);                               \
1999              slot_rmap_walk_next(_iter_))
2000
2001 static int kvm_handle_hva_range(struct kvm *kvm,
2002                                 unsigned long start,
2003                                 unsigned long end,
2004                                 unsigned long data,
2005                                 int (*handler)(struct kvm *kvm,
2006                                                struct kvm_rmap_head *rmap_head,
2007                                                struct kvm_memory_slot *slot,
2008                                                gfn_t gfn,
2009                                                int level,
2010                                                unsigned long data))
2011 {
2012         struct kvm_memslots *slots;
2013         struct kvm_memory_slot *memslot;
2014         struct slot_rmap_walk_iterator iterator;
2015         int ret = 0;
2016         int i;
2017
2018         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
2019                 slots = __kvm_memslots(kvm, i);
2020                 kvm_for_each_memslot(memslot, slots) {
2021                         unsigned long hva_start, hva_end;
2022                         gfn_t gfn_start, gfn_end;
2023
2024                         hva_start = max(start, memslot->userspace_addr);
2025                         hva_end = min(end, memslot->userspace_addr +
2026                                       (memslot->npages << PAGE_SHIFT));
2027                         if (hva_start >= hva_end)
2028                                 continue;
2029                         /*
2030                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
2031                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
2032                          */
2033                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
2034                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
2035
2036                         for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
2037                                                  PT_MAX_HUGEPAGE_LEVEL,
2038                                                  gfn_start, gfn_end - 1,
2039                                                  &iterator)
2040                                 ret |= handler(kvm, iterator.rmap, memslot,
2041                                                iterator.gfn, iterator.level, data);
2042                 }
2043         }
2044
2045         return ret;
2046 }
2047
2048 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
2049                           unsigned long data,
2050                           int (*handler)(struct kvm *kvm,
2051                                          struct kvm_rmap_head *rmap_head,
2052                                          struct kvm_memory_slot *slot,
2053                                          gfn_t gfn, int level,
2054                                          unsigned long data))
2055 {
2056         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
2057 }
2058
2059 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
2060                         unsigned flags)
2061 {
2062         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
2063 }
2064
2065 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2066 {
2067         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
2068 }
2069
2070 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2071                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
2072                          unsigned long data)
2073 {
2074         u64 *sptep;
2075         struct rmap_iterator uninitialized_var(iter);
2076         int young = 0;
2077
2078         for_each_rmap_spte(rmap_head, &iter, sptep)
2079                 young |= mmu_spte_age(sptep);
2080
2081         trace_kvm_age_page(gfn, level, slot, young);
2082         return young;
2083 }
2084
2085 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2086                               struct kvm_memory_slot *slot, gfn_t gfn,
2087                               int level, unsigned long data)
2088 {
2089         u64 *sptep;
2090         struct rmap_iterator iter;
2091
2092         for_each_rmap_spte(rmap_head, &iter, sptep)
2093                 if (is_accessed_spte(*sptep))
2094                         return 1;
2095         return 0;
2096 }
2097
2098 #define RMAP_RECYCLE_THRESHOLD 1000
2099
2100 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2101 {
2102         struct kvm_rmap_head *rmap_head;
2103         struct kvm_mmu_page *sp;
2104
2105         sp = page_header(__pa(spte));
2106
2107         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2108
2109         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2110         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2111                         KVM_PAGES_PER_HPAGE(sp->role.level));
2112 }
2113
2114 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2115 {
2116         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2117 }
2118
2119 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2120 {
2121         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2122 }
2123
2124 #ifdef MMU_DEBUG
2125 static int is_empty_shadow_page(u64 *spt)
2126 {
2127         u64 *pos;
2128         u64 *end;
2129
2130         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2131                 if (is_shadow_present_pte(*pos)) {
2132                         printk(KERN_ERR "%s: %p %llx\n", __func__,
2133                                pos, *pos);
2134                         return 0;
2135                 }
2136         return 1;
2137 }
2138 #endif
2139
2140 /*
2141  * This value is the sum of all of the kvm instances's
2142  * kvm->arch.n_used_mmu_pages values.  We need a global,
2143  * aggregate version in order to make the slab shrinker
2144  * faster
2145  */
2146 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
2147 {
2148         kvm->arch.n_used_mmu_pages += nr;
2149         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2150 }
2151
2152 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2153 {
2154         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2155         hlist_del(&sp->hash_link);
2156         list_del(&sp->link);
2157         free_page((unsigned long)sp->spt);
2158         if (!sp->role.direct)
2159                 free_page((unsigned long)sp->gfns);
2160         kmem_cache_free(mmu_page_header_cache, sp);
2161 }
2162
2163 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2164 {
2165         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2166 }
2167
2168 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2169                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2170 {
2171         if (!parent_pte)
2172                 return;
2173
2174         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2175 }
2176
2177 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2178                                        u64 *parent_pte)
2179 {
2180         __pte_list_remove(parent_pte, &sp->parent_ptes);
2181 }
2182
2183 static void drop_parent_pte(struct kvm_mmu_page *sp,
2184                             u64 *parent_pte)
2185 {
2186         mmu_page_remove_parent_pte(sp, parent_pte);
2187         mmu_spte_clear_no_track(parent_pte);
2188 }
2189
2190 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2191 {
2192         struct kvm_mmu_page *sp;
2193
2194         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2195         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2196         if (!direct)
2197                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2198         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2199
2200         /*
2201          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2202          * depends on valid pages being added to the head of the list.  See
2203          * comments in kvm_zap_obsolete_pages().
2204          */
2205         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2206         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2207         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2208         return sp;
2209 }
2210
2211 static void mark_unsync(u64 *spte);
2212 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2213 {
2214         u64 *sptep;
2215         struct rmap_iterator iter;
2216
2217         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2218                 mark_unsync(sptep);
2219         }
2220 }
2221
2222 static void mark_unsync(u64 *spte)
2223 {
2224         struct kvm_mmu_page *sp;
2225         unsigned int index;
2226
2227         sp = page_header(__pa(spte));
2228         index = spte - sp->spt;
2229         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2230                 return;
2231         if (sp->unsync_children++)
2232                 return;
2233         kvm_mmu_mark_parents_unsync(sp);
2234 }
2235
2236 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2237                                struct kvm_mmu_page *sp)
2238 {
2239         return 0;
2240 }
2241
2242 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2243 {
2244 }
2245
2246 #define KVM_PAGE_ARRAY_NR 16
2247
2248 struct kvm_mmu_pages {
2249         struct mmu_page_and_offset {
2250                 struct kvm_mmu_page *sp;
2251                 unsigned int idx;
2252         } page[KVM_PAGE_ARRAY_NR];
2253         unsigned int nr;
2254 };
2255
2256 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2257                          int idx)
2258 {
2259         int i;
2260
2261         if (sp->unsync)
2262                 for (i=0; i < pvec->nr; i++)
2263                         if (pvec->page[i].sp == sp)
2264                                 return 0;
2265
2266         pvec->page[pvec->nr].sp = sp;
2267         pvec->page[pvec->nr].idx = idx;
2268         pvec->nr++;
2269         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2270 }
2271
2272 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2273 {
2274         --sp->unsync_children;
2275         WARN_ON((int)sp->unsync_children < 0);
2276         __clear_bit(idx, sp->unsync_child_bitmap);
2277 }
2278
2279 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2280                            struct kvm_mmu_pages *pvec)
2281 {
2282         int i, ret, nr_unsync_leaf = 0;
2283
2284         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2285                 struct kvm_mmu_page *child;
2286                 u64 ent = sp->spt[i];
2287
2288                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2289                         clear_unsync_child_bit(sp, i);
2290                         continue;
2291                 }
2292
2293                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2294
2295                 if (child->unsync_children) {
2296                         if (mmu_pages_add(pvec, child, i))
2297                                 return -ENOSPC;
2298
2299                         ret = __mmu_unsync_walk(child, pvec);
2300                         if (!ret) {
2301                                 clear_unsync_child_bit(sp, i);
2302                                 continue;
2303                         } else if (ret > 0) {
2304                                 nr_unsync_leaf += ret;
2305                         } else
2306                                 return ret;
2307                 } else if (child->unsync) {
2308                         nr_unsync_leaf++;
2309                         if (mmu_pages_add(pvec, child, i))
2310                                 return -ENOSPC;
2311                 } else
2312                         clear_unsync_child_bit(sp, i);
2313         }
2314
2315         return nr_unsync_leaf;
2316 }
2317
2318 #define INVALID_INDEX (-1)
2319
2320 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2321                            struct kvm_mmu_pages *pvec)
2322 {
2323         pvec->nr = 0;
2324         if (!sp->unsync_children)
2325                 return 0;
2326
2327         mmu_pages_add(pvec, sp, INVALID_INDEX);
2328         return __mmu_unsync_walk(sp, pvec);
2329 }
2330
2331 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2332 {
2333         WARN_ON(!sp->unsync);
2334         trace_kvm_mmu_sync_page(sp);
2335         sp->unsync = 0;
2336         --kvm->stat.mmu_unsync;
2337 }
2338
2339 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2340                                      struct list_head *invalid_list);
2341 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2342                                     struct list_head *invalid_list);
2343
2344
2345 #define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2346         hlist_for_each_entry(_sp,                                       \
2347           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2348                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
2349                 } else
2350
2351 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2352         for_each_valid_sp(_kvm, _sp, _gfn)                              \
2353                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2354
2355 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2356 {
2357         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2358 }
2359
2360 /* @sp->gfn should be write-protected at the call site */
2361 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2362                             struct list_head *invalid_list)
2363 {
2364         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2365             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2366                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2367                 return false;
2368         }
2369
2370         return true;
2371 }
2372
2373 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2374                                         struct list_head *invalid_list,
2375                                         bool remote_flush)
2376 {
2377         if (!remote_flush && list_empty(invalid_list))
2378                 return false;
2379
2380         if (!list_empty(invalid_list))
2381                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2382         else
2383                 kvm_flush_remote_tlbs(kvm);
2384         return true;
2385 }
2386
2387 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2388                                  struct list_head *invalid_list,
2389                                  bool remote_flush, bool local_flush)
2390 {
2391         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2392                 return;
2393
2394         if (local_flush)
2395                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2396 }
2397
2398 #ifdef CONFIG_KVM_MMU_AUDIT
2399 #include "mmu_audit.c"
2400 #else
2401 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2402 static void mmu_audit_disable(void) { }
2403 #endif
2404
2405 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2406 {
2407         return sp->role.invalid ||
2408                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2409 }
2410
2411 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2412                          struct list_head *invalid_list)
2413 {
2414         kvm_unlink_unsync_page(vcpu->kvm, sp);
2415         return __kvm_sync_page(vcpu, sp, invalid_list);
2416 }
2417
2418 /* @gfn should be write-protected at the call site */
2419 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2420                            struct list_head *invalid_list)
2421 {
2422         struct kvm_mmu_page *s;
2423         bool ret = false;
2424
2425         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2426                 if (!s->unsync)
2427                         continue;
2428
2429                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2430                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2431         }
2432
2433         return ret;
2434 }
2435
2436 struct mmu_page_path {
2437         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2438         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2439 };
2440
2441 #define for_each_sp(pvec, sp, parents, i)                       \
2442                 for (i = mmu_pages_first(&pvec, &parents);      \
2443                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
2444                         i = mmu_pages_next(&pvec, &parents, i))
2445
2446 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2447                           struct mmu_page_path *parents,
2448                           int i)
2449 {
2450         int n;
2451
2452         for (n = i+1; n < pvec->nr; n++) {
2453                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2454                 unsigned idx = pvec->page[n].idx;
2455                 int level = sp->role.level;
2456
2457                 parents->idx[level-1] = idx;
2458                 if (level == PT_PAGE_TABLE_LEVEL)
2459                         break;
2460
2461                 parents->parent[level-2] = sp;
2462         }
2463
2464         return n;
2465 }
2466
2467 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2468                            struct mmu_page_path *parents)
2469 {
2470         struct kvm_mmu_page *sp;
2471         int level;
2472
2473         if (pvec->nr == 0)
2474                 return 0;
2475
2476         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2477
2478         sp = pvec->page[0].sp;
2479         level = sp->role.level;
2480         WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2481
2482         parents->parent[level-2] = sp;
2483
2484         /* Also set up a sentinel.  Further entries in pvec are all
2485          * children of sp, so this element is never overwritten.
2486          */
2487         parents->parent[level-1] = NULL;
2488         return mmu_pages_next(pvec, parents, 0);
2489 }
2490
2491 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2492 {
2493         struct kvm_mmu_page *sp;
2494         unsigned int level = 0;
2495
2496         do {
2497                 unsigned int idx = parents->idx[level];
2498                 sp = parents->parent[level];
2499                 if (!sp)
2500                         return;
2501
2502                 WARN_ON(idx == INVALID_INDEX);
2503                 clear_unsync_child_bit(sp, idx);
2504                 level++;
2505         } while (!sp->unsync_children);
2506 }
2507
2508 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2509                               struct kvm_mmu_page *parent)
2510 {
2511         int i;
2512         struct kvm_mmu_page *sp;
2513         struct mmu_page_path parents;
2514         struct kvm_mmu_pages pages;
2515         LIST_HEAD(invalid_list);
2516         bool flush = false;
2517
2518         while (mmu_unsync_walk(parent, &pages)) {
2519                 bool protected = false;
2520
2521                 for_each_sp(pages, sp, parents, i)
2522                         protected |= rmap_write_protect(vcpu, sp->gfn);
2523
2524                 if (protected) {
2525                         kvm_flush_remote_tlbs(vcpu->kvm);
2526                         flush = false;
2527                 }
2528
2529                 for_each_sp(pages, sp, parents, i) {
2530                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2531                         mmu_pages_clear_parents(&parents);
2532                 }
2533                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2534                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2535                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2536                         flush = false;
2537                 }
2538         }
2539
2540         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2541 }
2542
2543 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2544 {
2545         atomic_set(&sp->write_flooding_count,  0);
2546 }
2547
2548 static void clear_sp_write_flooding_count(u64 *spte)
2549 {
2550         struct kvm_mmu_page *sp =  page_header(__pa(spte));
2551
2552         __clear_sp_write_flooding_count(sp);
2553 }
2554
2555 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2556                                              gfn_t gfn,
2557                                              gva_t gaddr,
2558                                              unsigned level,
2559                                              int direct,
2560                                              unsigned access)
2561 {
2562         union kvm_mmu_page_role role;
2563         unsigned quadrant;
2564         struct kvm_mmu_page *sp;
2565         bool need_sync = false;
2566         bool flush = false;
2567         int collisions = 0;
2568         LIST_HEAD(invalid_list);
2569
2570         role = vcpu->arch.mmu->mmu_role.base;
2571         role.level = level;
2572         role.direct = direct;
2573         if (role.direct)
2574                 role.gpte_is_8_bytes = true;
2575         role.access = access;
2576         if (!vcpu->arch.mmu->direct_map
2577             && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2578                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2579                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2580                 role.quadrant = quadrant;
2581         }
2582         for_each_valid_sp(vcpu->kvm, sp, gfn) {
2583                 if (sp->gfn != gfn) {
2584                         collisions++;
2585                         continue;
2586                 }
2587
2588                 if (!need_sync && sp->unsync)
2589                         need_sync = true;
2590
2591                 if (sp->role.word != role.word)
2592                         continue;
2593
2594                 if (sp->unsync) {
2595                         /* The page is good, but __kvm_sync_page might still end
2596                          * up zapping it.  If so, break in order to rebuild it.
2597                          */
2598                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2599                                 break;
2600
2601                         WARN_ON(!list_empty(&invalid_list));
2602                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2603                 }
2604
2605                 if (sp->unsync_children)
2606                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2607
2608                 __clear_sp_write_flooding_count(sp);
2609                 trace_kvm_mmu_get_page(sp, false);
2610                 goto out;
2611         }
2612
2613         ++vcpu->kvm->stat.mmu_cache_miss;
2614
2615         sp = kvm_mmu_alloc_page(vcpu, direct);
2616
2617         sp->gfn = gfn;
2618         sp->role = role;
2619         hlist_add_head(&sp->hash_link,
2620                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2621         if (!direct) {
2622                 /*
2623                  * we should do write protection before syncing pages
2624                  * otherwise the content of the synced shadow page may
2625                  * be inconsistent with guest page table.
2626                  */
2627                 account_shadowed(vcpu->kvm, sp);
2628                 if (level == PT_PAGE_TABLE_LEVEL &&
2629                       rmap_write_protect(vcpu, gfn))
2630                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2631
2632                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2633                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2634         }
2635         clear_page(sp->spt);
2636         trace_kvm_mmu_get_page(sp, true);
2637
2638         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2639 out:
2640         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2641                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2642         return sp;
2643 }
2644
2645 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2646                                         struct kvm_vcpu *vcpu, hpa_t root,
2647                                         u64 addr)
2648 {
2649         iterator->addr = addr;
2650         iterator->shadow_addr = root;
2651         iterator->level = vcpu->arch.mmu->shadow_root_level;
2652
2653         if (iterator->level == PT64_ROOT_4LEVEL &&
2654             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2655             !vcpu->arch.mmu->direct_map)
2656                 --iterator->level;
2657
2658         if (iterator->level == PT32E_ROOT_LEVEL) {
2659                 /*
2660                  * prev_root is currently only used for 64-bit hosts. So only
2661                  * the active root_hpa is valid here.
2662                  */
2663                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2664
2665                 iterator->shadow_addr
2666                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2667                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2668                 --iterator->level;
2669                 if (!iterator->shadow_addr)
2670                         iterator->level = 0;
2671         }
2672 }
2673
2674 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2675                              struct kvm_vcpu *vcpu, u64 addr)
2676 {
2677         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2678                                     addr);
2679 }
2680
2681 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2682 {
2683         if (iterator->level < PT_PAGE_TABLE_LEVEL)
2684                 return false;
2685
2686         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2687         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2688         return true;
2689 }
2690
2691 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2692                                u64 spte)
2693 {
2694         if (is_last_spte(spte, iterator->level)) {
2695                 iterator->level = 0;
2696                 return;
2697         }
2698
2699         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2700         --iterator->level;
2701 }
2702
2703 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2704 {
2705         __shadow_walk_next(iterator, *iterator->sptep);
2706 }
2707
2708 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2709                              struct kvm_mmu_page *sp)
2710 {
2711         u64 spte;
2712
2713         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2714
2715         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2716                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2717
2718         if (sp_ad_disabled(sp))
2719                 spte |= SPTE_AD_DISABLED_MASK;
2720         else
2721                 spte |= shadow_accessed_mask;
2722
2723         mmu_spte_set(sptep, spte);
2724
2725         mmu_page_add_parent_pte(vcpu, sp, sptep);
2726
2727         if (sp->unsync_children || sp->unsync)
2728                 mark_unsync(sptep);
2729 }
2730
2731 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2732                                    unsigned direct_access)
2733 {
2734         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2735                 struct kvm_mmu_page *child;
2736
2737                 /*
2738                  * For the direct sp, if the guest pte's dirty bit
2739                  * changed form clean to dirty, it will corrupt the
2740                  * sp's access: allow writable in the read-only sp,
2741                  * so we should update the spte at this point to get
2742                  * a new sp with the correct access.
2743                  */
2744                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2745                 if (child->role.access == direct_access)
2746                         return;
2747
2748                 drop_parent_pte(child, sptep);
2749                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2750         }
2751 }
2752
2753 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2754                              u64 *spte)
2755 {
2756         u64 pte;
2757         struct kvm_mmu_page *child;
2758
2759         pte = *spte;
2760         if (is_shadow_present_pte(pte)) {
2761                 if (is_last_spte(pte, sp->role.level)) {
2762                         drop_spte(kvm, spte);
2763                         if (is_large_pte(pte))
2764                                 --kvm->stat.lpages;
2765                 } else {
2766                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2767                         drop_parent_pte(child, spte);
2768                 }
2769                 return true;
2770         }
2771
2772         if (is_mmio_spte(pte))
2773                 mmu_spte_clear_no_track(spte);
2774
2775         return false;
2776 }
2777
2778 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2779                                          struct kvm_mmu_page *sp)
2780 {
2781         unsigned i;
2782
2783         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2784                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2785 }
2786
2787 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2788 {
2789         u64 *sptep;
2790         struct rmap_iterator iter;
2791
2792         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2793                 drop_parent_pte(sp, sptep);
2794 }
2795
2796 static int mmu_zap_unsync_children(struct kvm *kvm,
2797                                    struct kvm_mmu_page *parent,
2798                                    struct list_head *invalid_list)
2799 {
2800         int i, zapped = 0;
2801         struct mmu_page_path parents;
2802         struct kvm_mmu_pages pages;
2803
2804         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2805                 return 0;
2806
2807         while (mmu_unsync_walk(parent, &pages)) {
2808                 struct kvm_mmu_page *sp;
2809
2810                 for_each_sp(pages, sp, parents, i) {
2811                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2812                         mmu_pages_clear_parents(&parents);
2813                         zapped++;
2814                 }
2815         }
2816
2817         return zapped;
2818 }
2819
2820 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2821                                        struct kvm_mmu_page *sp,
2822                                        struct list_head *invalid_list,
2823                                        int *nr_zapped)
2824 {
2825         bool list_unstable;
2826
2827         trace_kvm_mmu_prepare_zap_page(sp);
2828         ++kvm->stat.mmu_shadow_zapped;
2829         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2830         kvm_mmu_page_unlink_children(kvm, sp);
2831         kvm_mmu_unlink_parents(kvm, sp);
2832
2833         /* Zapping children means active_mmu_pages has become unstable. */
2834         list_unstable = *nr_zapped;
2835
2836         if (!sp->role.invalid && !sp->role.direct)
2837                 unaccount_shadowed(kvm, sp);
2838
2839         if (sp->unsync)
2840                 kvm_unlink_unsync_page(kvm, sp);
2841         if (!sp->root_count) {
2842                 /* Count self */
2843                 (*nr_zapped)++;
2844                 list_move(&sp->link, invalid_list);
2845                 kvm_mod_used_mmu_pages(kvm, -1);
2846         } else {
2847                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2848
2849                 /*
2850                  * Obsolete pages cannot be used on any vCPUs, see the comment
2851                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2852                  * treats invalid shadow pages as being obsolete.
2853                  */
2854                 if (!is_obsolete_sp(kvm, sp))
2855                         kvm_reload_remote_mmus(kvm);
2856         }
2857
2858         if (sp->lpage_disallowed)
2859                 unaccount_huge_nx_page(kvm, sp);
2860
2861         sp->role.invalid = 1;
2862         return list_unstable;
2863 }
2864
2865 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2866                                      struct list_head *invalid_list)
2867 {
2868         int nr_zapped;
2869
2870         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2871         return nr_zapped;
2872 }
2873
2874 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2875                                     struct list_head *invalid_list)
2876 {
2877         struct kvm_mmu_page *sp, *nsp;
2878
2879         if (list_empty(invalid_list))
2880                 return;
2881
2882         /*
2883          * We need to make sure everyone sees our modifications to
2884          * the page tables and see changes to vcpu->mode here. The barrier
2885          * in the kvm_flush_remote_tlbs() achieves this. This pairs
2886          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2887          *
2888          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2889          * guest mode and/or lockless shadow page table walks.
2890          */
2891         kvm_flush_remote_tlbs(kvm);
2892
2893         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2894                 WARN_ON(!sp->role.invalid || sp->root_count);
2895                 kvm_mmu_free_page(sp);
2896         }
2897 }
2898
2899 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2900                                         struct list_head *invalid_list)
2901 {
2902         struct kvm_mmu_page *sp;
2903
2904         if (list_empty(&kvm->arch.active_mmu_pages))
2905                 return false;
2906
2907         sp = list_last_entry(&kvm->arch.active_mmu_pages,
2908                              struct kvm_mmu_page, link);
2909         return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2910 }
2911
2912 /*
2913  * Changing the number of mmu pages allocated to the vm
2914  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2915  */
2916 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2917 {
2918         LIST_HEAD(invalid_list);
2919
2920         spin_lock(&kvm->mmu_lock);
2921
2922         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2923                 /* Need to free some mmu pages to achieve the goal. */
2924                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2925                         if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2926                                 break;
2927
2928                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2929                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2930         }
2931
2932         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2933
2934         spin_unlock(&kvm->mmu_lock);
2935 }
2936
2937 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2938 {
2939         struct kvm_mmu_page *sp;
2940         LIST_HEAD(invalid_list);
2941         int r;
2942
2943         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2944         r = 0;
2945         spin_lock(&kvm->mmu_lock);
2946         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2947                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2948                          sp->role.word);
2949                 r = 1;
2950                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2951         }
2952         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2953         spin_unlock(&kvm->mmu_lock);
2954
2955         return r;
2956 }
2957 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2958
2959 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2960 {
2961         trace_kvm_mmu_unsync_page(sp);
2962         ++vcpu->kvm->stat.mmu_unsync;
2963         sp->unsync = 1;
2964
2965         kvm_mmu_mark_parents_unsync(sp);
2966 }
2967
2968 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2969                                    bool can_unsync)
2970 {
2971         struct kvm_mmu_page *sp;
2972
2973         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2974                 return true;
2975
2976         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2977                 if (!can_unsync)
2978                         return true;
2979
2980                 if (sp->unsync)
2981                         continue;
2982
2983                 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2984                 kvm_unsync_page(vcpu, sp);
2985         }
2986
2987         /*
2988          * We need to ensure that the marking of unsync pages is visible
2989          * before the SPTE is updated to allow writes because
2990          * kvm_mmu_sync_roots() checks the unsync flags without holding
2991          * the MMU lock and so can race with this. If the SPTE was updated
2992          * before the page had been marked as unsync-ed, something like the
2993          * following could happen:
2994          *
2995          * CPU 1                    CPU 2
2996          * ---------------------------------------------------------------------
2997          * 1.2 Host updates SPTE
2998          *     to be writable
2999          *                      2.1 Guest writes a GPTE for GVA X.
3000          *                          (GPTE being in the guest page table shadowed
3001          *                           by the SP from CPU 1.)
3002          *                          This reads SPTE during the page table walk.
3003          *                          Since SPTE.W is read as 1, there is no
3004          *                          fault.
3005          *
3006          *                      2.2 Guest issues TLB flush.
3007          *                          That causes a VM Exit.
3008          *
3009          *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
3010          *                          Since it is false, so it just returns.
3011          *
3012          *                      2.4 Guest accesses GVA X.
3013          *                          Since the mapping in the SP was not updated,
3014          *                          so the old mapping for GVA X incorrectly
3015          *                          gets used.
3016          * 1.1 Host marks SP
3017          *     as unsync
3018          *     (sp->unsync = true)
3019          *
3020          * The write barrier below ensures that 1.1 happens before 1.2 and thus
3021          * the situation in 2.4 does not arise. The implicit barrier in 2.2
3022          * pairs with this write barrier.
3023          */
3024         smp_wmb();
3025
3026         return false;
3027 }
3028
3029 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
3030 {
3031         if (pfn_valid(pfn))
3032                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
3033                         /*
3034                          * Some reserved pages, such as those from NVDIMM
3035                          * DAX devices, are not for MMIO, and can be mapped
3036                          * with cached memory type for better performance.
3037                          * However, the above check misconceives those pages
3038                          * as MMIO, and results in KVM mapping them with UC
3039                          * memory type, which would hurt the performance.
3040                          * Therefore, we check the host memory type in addition
3041                          * and only treat UC/UC-/WC pages as MMIO.
3042                          */
3043                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
3044
3045         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
3046                                      pfn_to_hpa(pfn + 1) - 1,
3047                                      E820_TYPE_RAM);
3048 }
3049
3050 /* Bits which may be returned by set_spte() */
3051 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
3052 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
3053
3054 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3055                     unsigned pte_access, int level,
3056                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3057                     bool can_unsync, bool host_writable)
3058 {
3059         u64 spte = 0;
3060         int ret = 0;
3061         struct kvm_mmu_page *sp;
3062
3063         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3064                 return 0;
3065
3066         sp = page_header(__pa(sptep));
3067         if (sp_ad_disabled(sp))
3068                 spte |= SPTE_AD_DISABLED_MASK;
3069         else if (kvm_vcpu_ad_need_write_protect(vcpu))
3070                 spte |= SPTE_AD_WRPROT_ONLY_MASK;
3071
3072         /*
3073          * For the EPT case, shadow_present_mask is 0 if hardware
3074          * supports exec-only page table entries.  In that case,
3075          * ACC_USER_MASK and shadow_user_mask are used to represent
3076          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
3077          */
3078         spte |= shadow_present_mask;
3079         if (!speculative)
3080                 spte |= spte_shadow_accessed_mask(spte);
3081
3082         if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3083             is_nx_huge_page_enabled()) {
3084                 pte_access &= ~ACC_EXEC_MASK;
3085         }
3086
3087         if (pte_access & ACC_EXEC_MASK)
3088                 spte |= shadow_x_mask;
3089         else
3090                 spte |= shadow_nx_mask;
3091
3092         if (pte_access & ACC_USER_MASK)
3093                 spte |= shadow_user_mask;
3094
3095         if (level > PT_PAGE_TABLE_LEVEL)
3096                 spte |= PT_PAGE_SIZE_MASK;
3097         if (tdp_enabled)
3098                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
3099                         kvm_is_mmio_pfn(pfn));
3100
3101         if (host_writable)
3102                 spte |= SPTE_HOST_WRITEABLE;
3103         else
3104                 pte_access &= ~ACC_WRITE_MASK;
3105
3106         if (!kvm_is_mmio_pfn(pfn))
3107                 spte |= shadow_me_mask;
3108
3109         spte |= (u64)pfn << PAGE_SHIFT;
3110
3111         if (pte_access & ACC_WRITE_MASK) {
3112
3113                 /*
3114                  * Other vcpu creates new sp in the window between
3115                  * mapping_level() and acquiring mmu-lock. We can
3116                  * allow guest to retry the access, the mapping can
3117                  * be fixed if guest refault.
3118                  */
3119                 if (level > PT_PAGE_TABLE_LEVEL &&
3120                     mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
3121                         goto done;
3122
3123                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3124
3125                 /*
3126                  * Optimization: for pte sync, if spte was writable the hash
3127                  * lookup is unnecessary (and expensive). Write protection
3128                  * is responsibility of mmu_get_page / kvm_sync_page.
3129                  * Same reasoning can be applied to dirty page accounting.
3130                  */
3131                 if (!can_unsync && is_writable_pte(*sptep))
3132                         goto set_pte;
3133
3134                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3135                         pgprintk("%s: found shadow page for %llx, marking ro\n",
3136                                  __func__, gfn);
3137                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
3138                         pte_access &= ~ACC_WRITE_MASK;
3139                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3140                 }
3141         }
3142
3143         if (pte_access & ACC_WRITE_MASK) {
3144                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3145                 spte |= spte_shadow_dirty_mask(spte);
3146         }
3147
3148         if (speculative)
3149                 spte = mark_spte_for_access_track(spte);
3150
3151 set_pte:
3152         if (mmu_spte_update(sptep, spte))
3153                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3154 done:
3155         return ret;
3156 }
3157
3158 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3159                         int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3160                         bool speculative, bool host_writable)
3161 {
3162         int was_rmapped = 0;
3163         int rmap_count;
3164         int set_spte_ret;
3165         int ret = RET_PF_RETRY;
3166         bool flush = false;
3167
3168         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3169                  *sptep, write_fault, gfn);
3170
3171         if (is_shadow_present_pte(*sptep)) {
3172                 /*
3173                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3174                  * the parent of the now unreachable PTE.
3175                  */
3176                 if (level > PT_PAGE_TABLE_LEVEL &&
3177                     !is_large_pte(*sptep)) {
3178                         struct kvm_mmu_page *child;
3179                         u64 pte = *sptep;
3180
3181                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3182                         drop_parent_pte(child, sptep);
3183                         flush = true;
3184                 } else if (pfn != spte_to_pfn(*sptep)) {
3185                         pgprintk("hfn old %llx new %llx\n",
3186                                  spte_to_pfn(*sptep), pfn);
3187                         drop_spte(vcpu->kvm, sptep);
3188                         flush = true;
3189                 } else
3190                         was_rmapped = 1;
3191         }
3192
3193         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3194                                 speculative, true, host_writable);
3195         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3196                 if (write_fault)
3197                         ret = RET_PF_EMULATE;
3198                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3199         }
3200
3201         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3202                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3203                                 KVM_PAGES_PER_HPAGE(level));
3204
3205         if (unlikely(is_mmio_spte(*sptep)))
3206                 ret = RET_PF_EMULATE;
3207
3208         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3209         trace_kvm_mmu_set_spte(level, gfn, sptep);
3210         if (!was_rmapped && is_large_pte(*sptep))
3211                 ++vcpu->kvm->stat.lpages;
3212
3213         if (is_shadow_present_pte(*sptep)) {
3214                 if (!was_rmapped) {
3215                         rmap_count = rmap_add(vcpu, sptep, gfn);
3216                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3217                                 rmap_recycle(vcpu, sptep, gfn);
3218                 }
3219         }
3220
3221         return ret;
3222 }
3223
3224 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3225                                      bool no_dirty_log)
3226 {
3227         struct kvm_memory_slot *slot;
3228
3229         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3230         if (!slot)
3231                 return KVM_PFN_ERR_FAULT;
3232
3233         return gfn_to_pfn_memslot_atomic(slot, gfn);
3234 }
3235
3236 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3237                                     struct kvm_mmu_page *sp,
3238                                     u64 *start, u64 *end)
3239 {
3240         struct page *pages[PTE_PREFETCH_NUM];
3241         struct kvm_memory_slot *slot;
3242         unsigned access = sp->role.access;
3243         int i, ret;
3244         gfn_t gfn;
3245
3246         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3247         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3248         if (!slot)
3249                 return -1;
3250
3251         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3252         if (ret <= 0)
3253                 return -1;
3254
3255         for (i = 0; i < ret; i++, gfn++, start++) {
3256                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3257                              page_to_pfn(pages[i]), true, true);
3258                 put_page(pages[i]);
3259         }
3260
3261         return 0;
3262 }
3263
3264 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3265                                   struct kvm_mmu_page *sp, u64 *sptep)
3266 {
3267         u64 *spte, *start = NULL;
3268         int i;
3269
3270         WARN_ON(!sp->role.direct);
3271
3272         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3273         spte = sp->spt + i;
3274
3275         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3276                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3277                         if (!start)
3278                                 continue;
3279                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3280                                 break;
3281                         start = NULL;
3282                 } else if (!start)
3283                         start = spte;
3284         }
3285 }
3286
3287 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3288 {
3289         struct kvm_mmu_page *sp;
3290
3291         sp = page_header(__pa(sptep));
3292
3293         /*
3294          * Without accessed bits, there's no way to distinguish between
3295          * actually accessed translations and prefetched, so disable pte
3296          * prefetch if accessed bits aren't available.
3297          */
3298         if (sp_ad_disabled(sp))
3299                 return;
3300
3301         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3302                 return;
3303
3304         __direct_pte_prefetch(vcpu, sp, sptep);
3305 }
3306
3307 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3308                                        gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3309 {
3310         int level = *levelp;
3311         u64 spte = *it.sptep;
3312
3313         if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3314             is_nx_huge_page_enabled() &&
3315             is_shadow_present_pte(spte) &&
3316             !is_large_pte(spte)) {
3317                 /*
3318                  * A small SPTE exists for this pfn, but FNAME(fetch)
3319                  * and __direct_map would like to create a large PTE
3320                  * instead: just force them to go down another level,
3321                  * patching back for them into pfn the next 9 bits of
3322                  * the address.
3323                  */
3324                 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3325                 *pfnp |= gfn & page_mask;
3326                 (*levelp)--;
3327         }
3328 }
3329
3330 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3331                         int map_writable, int level, kvm_pfn_t pfn,
3332                         bool prefault, bool lpage_disallowed)
3333 {
3334         struct kvm_shadow_walk_iterator it;
3335         struct kvm_mmu_page *sp;
3336         int ret;
3337         gfn_t gfn = gpa >> PAGE_SHIFT;
3338         gfn_t base_gfn = gfn;
3339
3340         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3341                 return RET_PF_RETRY;
3342
3343         trace_kvm_mmu_spte_requested(gpa, level, pfn);
3344         for_each_shadow_entry(vcpu, gpa, it) {
3345                 /*
3346                  * We cannot overwrite existing page tables with an NX
3347                  * large page, as the leaf could be executable.
3348                  */
3349                 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3350
3351                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3352                 if (it.level == level)
3353                         break;
3354
3355                 drop_large_spte(vcpu, it.sptep);
3356                 if (!is_shadow_present_pte(*it.sptep)) {
3357                         sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3358                                               it.level - 1, true, ACC_ALL);
3359
3360                         link_shadow_page(vcpu, it.sptep, sp);
3361                         if (lpage_disallowed)
3362                                 account_huge_nx_page(vcpu->kvm, sp);
3363                 }
3364         }
3365
3366         ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3367                            write, level, base_gfn, pfn, prefault,
3368                            map_writable);
3369         direct_pte_prefetch(vcpu, it.sptep);
3370         ++vcpu->stat.pf_fixed;
3371         return ret;
3372 }
3373
3374 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3375 {
3376         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3377 }
3378
3379 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3380 {
3381         /*
3382          * Do not cache the mmio info caused by writing the readonly gfn
3383          * into the spte otherwise read access on readonly gfn also can
3384          * caused mmio page fault and treat it as mmio access.
3385          */
3386         if (pfn == KVM_PFN_ERR_RO_FAULT)
3387                 return RET_PF_EMULATE;
3388
3389         if (pfn == KVM_PFN_ERR_HWPOISON) {
3390                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3391                 return RET_PF_RETRY;
3392         }
3393
3394         return -EFAULT;
3395 }
3396
3397 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3398                                         gfn_t gfn, kvm_pfn_t *pfnp,
3399                                         int *levelp)
3400 {
3401         kvm_pfn_t pfn = *pfnp;
3402         int level = *levelp;
3403
3404         /*
3405          * Check if it's a transparent hugepage. If this would be an
3406          * hugetlbfs page, level wouldn't be set to
3407          * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
3408          * here.
3409          */
3410         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3411             !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
3412             PageTransCompoundMap(pfn_to_page(pfn)) &&
3413             !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3414                 unsigned long mask;
3415                 /*
3416                  * mmu_notifier_retry was successful and we hold the
3417                  * mmu_lock here, so the pmd can't become splitting
3418                  * from under us, and in turn
3419                  * __split_huge_page_refcount() can't run from under
3420                  * us and we can safely transfer the refcount from
3421                  * PG_tail to PG_head as we switch the pfn to tail to
3422                  * head.
3423                  */
3424                 *levelp = level = PT_DIRECTORY_LEVEL;
3425                 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3426                 VM_BUG_ON((gfn & mask) != (pfn & mask));
3427                 if (pfn & mask) {
3428                         kvm_release_pfn_clean(pfn);
3429                         pfn &= ~mask;
3430                         kvm_get_pfn(pfn);
3431                         *pfnp = pfn;
3432                 }
3433         }
3434 }
3435
3436 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3437                                 kvm_pfn_t pfn, unsigned access, int *ret_val)
3438 {
3439         /* The pfn is invalid, report the error! */
3440         if (unlikely(is_error_pfn(pfn))) {
3441                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3442                 return true;
3443         }
3444
3445         if (unlikely(is_noslot_pfn(pfn)))
3446                 vcpu_cache_mmio_info(vcpu, gva, gfn,
3447                                      access & shadow_mmio_access_mask);
3448
3449         return false;
3450 }
3451
3452 static bool page_fault_can_be_fast(u32 error_code)
3453 {
3454         /*
3455          * Do not fix the mmio spte with invalid generation number which
3456          * need to be updated by slow page fault path.
3457          */
3458         if (unlikely(error_code & PFERR_RSVD_MASK))
3459                 return false;
3460
3461         /* See if the page fault is due to an NX violation */
3462         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3463                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3464                 return false;
3465
3466         /*
3467          * #PF can be fast if:
3468          * 1. The shadow page table entry is not present, which could mean that
3469          *    the fault is potentially caused by access tracking (if enabled).
3470          * 2. The shadow page table entry is present and the fault
3471          *    is caused by write-protect, that means we just need change the W
3472          *    bit of the spte which can be done out of mmu-lock.
3473          *
3474          * However, if access tracking is disabled we know that a non-present
3475          * page must be a genuine page fault where we have to create a new SPTE.
3476          * So, if access tracking is disabled, we return true only for write
3477          * accesses to a present page.
3478          */
3479
3480         return shadow_acc_track_mask != 0 ||
3481                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3482                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3483 }
3484
3485 /*
3486  * Returns true if the SPTE was fixed successfully. Otherwise,
3487  * someone else modified the SPTE from its original value.
3488  */
3489 static bool
3490 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3491                         u64 *sptep, u64 old_spte, u64 new_spte)
3492 {
3493         gfn_t gfn;
3494
3495         WARN_ON(!sp->role.direct);
3496
3497         /*
3498          * Theoretically we could also set dirty bit (and flush TLB) here in
3499          * order to eliminate unnecessary PML logging. See comments in
3500          * set_spte. But fast_page_fault is very unlikely to happen with PML
3501          * enabled, so we do not do this. This might result in the same GPA
3502          * to be logged in PML buffer again when the write really happens, and
3503          * eventually to be called by mark_page_dirty twice. But it's also no
3504          * harm. This also avoids the TLB flush needed after setting dirty bit
3505          * so non-PML cases won't be impacted.
3506          *
3507          * Compare with set_spte where instead shadow_dirty_mask is set.
3508          */
3509         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3510                 return false;
3511
3512         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3513                 /*
3514                  * The gfn of direct spte is stable since it is
3515                  * calculated by sp->gfn.
3516                  */
3517                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3518                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3519         }
3520
3521         return true;
3522 }
3523
3524 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3525 {
3526         if (fault_err_code & PFERR_FETCH_MASK)
3527                 return is_executable_pte(spte);
3528
3529         if (fault_err_code & PFERR_WRITE_MASK)
3530                 return is_writable_pte(spte);
3531
3532         /* Fault was on Read access */
3533         return spte & PT_PRESENT_MASK;
3534 }
3535
3536 /*
3537  * Return value:
3538  * - true: let the vcpu to access on the same address again.
3539  * - false: let the real page fault path to fix it.
3540  */
3541 static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
3542                             u32 error_code)
3543 {
3544         struct kvm_shadow_walk_iterator iterator;
3545         struct kvm_mmu_page *sp;
3546         bool fault_handled = false;
3547         u64 spte = 0ull;
3548         uint retry_count = 0;
3549
3550         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3551                 return false;
3552
3553         if (!page_fault_can_be_fast(error_code))
3554                 return false;
3555
3556         walk_shadow_page_lockless_begin(vcpu);
3557
3558         do {
3559                 u64 new_spte;
3560
3561                 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
3562                         if (!is_shadow_present_pte(spte) ||
3563                             iterator.level < level)
3564                                 break;
3565
3566                 sp = page_header(__pa(iterator.sptep));
3567                 if (!is_last_spte(spte, sp->role.level))
3568                         break;
3569
3570                 /*
3571                  * Check whether the memory access that caused the fault would
3572                  * still cause it if it were to be performed right now. If not,
3573                  * then this is a spurious fault caused by TLB lazily flushed,
3574                  * or some other CPU has already fixed the PTE after the
3575                  * current CPU took the fault.
3576                  *
3577                  * Need not check the access of upper level table entries since
3578                  * they are always ACC_ALL.
3579                  */
3580                 if (is_access_allowed(error_code, spte)) {
3581                         fault_handled = true;
3582                         break;
3583                 }
3584
3585                 new_spte = spte;
3586
3587                 if (is_access_track_spte(spte))
3588                         new_spte = restore_acc_track_spte(new_spte);
3589
3590                 /*
3591                  * Currently, to simplify the code, write-protection can
3592                  * be removed in the fast path only if the SPTE was
3593                  * write-protected for dirty-logging or access tracking.
3594                  */
3595                 if ((error_code & PFERR_WRITE_MASK) &&
3596                     spte_can_locklessly_be_made_writable(spte))
3597                 {
3598                         new_spte |= PT_WRITABLE_MASK;
3599
3600                         /*
3601                          * Do not fix write-permission on the large spte.  Since
3602                          * we only dirty the first page into the dirty-bitmap in
3603                          * fast_pf_fix_direct_spte(), other pages are missed
3604                          * if its slot has dirty logging enabled.
3605                          *
3606                          * Instead, we let the slow page fault path create a
3607                          * normal spte to fix the access.
3608                          *
3609                          * See the comments in kvm_arch_commit_memory_region().
3610                          */
3611                         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3612                                 break;
3613                 }
3614
3615                 /* Verify that the fault can be handled in the fast path */
3616                 if (new_spte == spte ||
3617                     !is_access_allowed(error_code, new_spte))
3618                         break;
3619
3620                 /*
3621                  * Currently, fast page fault only works for direct mapping
3622                  * since the gfn is not stable for indirect shadow page. See
3623                  * Documentation/virt/kvm/locking.txt to get more detail.
3624                  */
3625                 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3626                                                         iterator.sptep, spte,
3627                                                         new_spte);
3628                 if (fault_handled)
3629                         break;
3630
3631                 if (++retry_count > 4) {
3632                         printk_once(KERN_WARNING
3633                                 "kvm: Fast #PF retrying more than 4 times.\n");
3634                         break;
3635                 }
3636
3637         } while (true);
3638
3639         trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
3640                               spte, fault_handled);
3641         walk_shadow_page_lockless_end(vcpu);
3642
3643         return fault_handled;
3644 }
3645
3646 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3647                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
3648                          bool *writable);
3649 static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3650
3651 static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
3652                          gfn_t gfn, bool prefault)
3653 {
3654         int r;
3655         int level;
3656         bool force_pt_level;
3657         kvm_pfn_t pfn;
3658         unsigned long mmu_seq;
3659         bool map_writable, write = error_code & PFERR_WRITE_MASK;
3660         bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
3661                                 is_nx_huge_page_enabled();
3662
3663         force_pt_level = lpage_disallowed;
3664         level = mapping_level(vcpu, gfn, &force_pt_level);
3665         if (likely(!force_pt_level)) {
3666                 /*
3667                  * This path builds a PAE pagetable - so we can map
3668                  * 2mb pages at maximum. Therefore check if the level
3669                  * is larger than that.
3670                  */
3671                 if (level > PT_DIRECTORY_LEVEL)
3672                         level = PT_DIRECTORY_LEVEL;
3673
3674                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3675         }
3676
3677         if (fast_page_fault(vcpu, gpa, level, error_code))
3678                 return RET_PF_RETRY;
3679
3680         mmu_seq = vcpu->kvm->mmu_notifier_seq;
3681         smp_rmb();
3682
3683         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
3684                 return RET_PF_RETRY;
3685
3686         if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
3687                 return r;
3688
3689         r = RET_PF_RETRY;
3690         spin_lock(&vcpu->kvm->mmu_lock);
3691         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3692                 goto out_unlock;
3693         if (make_mmu_pages_available(vcpu) < 0)
3694                 goto out_unlock;
3695         if (likely(!force_pt_level))
3696                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3697         r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
3698                          prefault, false);
3699 out_unlock:
3700         spin_unlock(&vcpu->kvm->mmu_lock);
3701         kvm_release_pfn_clean(pfn);
3702         return r;
3703 }
3704
3705 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3706                                struct list_head *invalid_list)
3707 {
3708         struct kvm_mmu_page *sp;
3709
3710         if (!VALID_PAGE(*root_hpa))
3711                 return;
3712
3713         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3714         --sp->root_count;
3715         if (!sp->root_count && sp->role.invalid)
3716                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3717
3718         *root_hpa = INVALID_PAGE;
3719 }
3720
3721 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3722 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3723                         ulong roots_to_free)
3724 {
3725         int i;
3726         LIST_HEAD(invalid_list);
3727         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3728
3729         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3730
3731         /* Before acquiring the MMU lock, see if we need to do any real work. */
3732         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3733                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3734                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3735                             VALID_PAGE(mmu->prev_roots[i].hpa))
3736                                 break;
3737
3738                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3739                         return;
3740         }
3741
3742         spin_lock(&vcpu->kvm->mmu_lock);
3743
3744         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3745                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3746                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3747                                            &invalid_list);
3748
3749         if (free_active_root) {
3750                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3751                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3752                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3753                                            &invalid_list);
3754                 } else {
3755                         for (i = 0; i < 4; ++i)
3756                                 if (mmu->pae_root[i] != 0)
3757                                         mmu_free_root_page(vcpu->kvm,
3758                                                            &mmu->pae_root[i],
3759                                                            &invalid_list);
3760                         mmu->root_hpa = INVALID_PAGE;
3761                 }
3762                 mmu->root_cr3 = 0;
3763         }
3764
3765         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3766         spin_unlock(&vcpu->kvm->mmu_lock);
3767 }
3768 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3769
3770 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3771 {
3772         int ret = 0;
3773
3774         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3775                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3776                 ret = 1;
3777         }
3778
3779         return ret;
3780 }
3781
3782 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3783 {
3784         struct kvm_mmu_page *sp;
3785         unsigned i;
3786
3787         if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3788                 spin_lock(&vcpu->kvm->mmu_lock);
3789                 if(make_mmu_pages_available(vcpu) < 0) {
3790                         spin_unlock(&vcpu->kvm->mmu_lock);
3791                         return -ENOSPC;
3792                 }
3793                 sp = kvm_mmu_get_page(vcpu, 0, 0,
3794                                 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3795                 ++sp->root_count;
3796                 spin_unlock(&vcpu->kvm->mmu_lock);
3797                 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3798         } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3799                 for (i = 0; i < 4; ++i) {
3800                         hpa_t root = vcpu->arch.mmu->pae_root[i];
3801
3802                         MMU_WARN_ON(VALID_PAGE(root));
3803                         spin_lock(&vcpu->kvm->mmu_lock);
3804                         if (make_mmu_pages_available(vcpu) < 0) {
3805                                 spin_unlock(&vcpu->kvm->mmu_lock);
3806                                 return -ENOSPC;
3807                         }
3808                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3809                                         i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3810                         root = __pa(sp->spt);
3811                         ++sp->root_count;
3812                         spin_unlock(&vcpu->kvm->mmu_lock);
3813                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3814                 }
3815                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3816         } else
3817                 BUG();
3818         vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3819
3820         return 0;
3821 }
3822
3823 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3824 {
3825         struct kvm_mmu_page *sp;
3826         u64 pdptr, pm_mask;
3827         gfn_t root_gfn, root_cr3;
3828         int i;
3829
3830         root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3831         root_gfn = root_cr3 >> PAGE_SHIFT;
3832
3833         if (mmu_check_root(vcpu, root_gfn))
3834                 return 1;
3835
3836         /*
3837          * Do we shadow a long mode page table? If so we need to
3838          * write-protect the guests page table root.
3839          */
3840         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3841                 hpa_t root = vcpu->arch.mmu->root_hpa;
3842
3843                 MMU_WARN_ON(VALID_PAGE(root));
3844
3845                 spin_lock(&vcpu->kvm->mmu_lock);
3846                 if (make_mmu_pages_available(vcpu) < 0) {
3847                         spin_unlock(&vcpu->kvm->mmu_lock);
3848                         return -ENOSPC;
3849                 }
3850                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3851                                 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3852                 root = __pa(sp->spt);
3853                 ++sp->root_count;
3854                 spin_unlock(&vcpu->kvm->mmu_lock);
3855                 vcpu->arch.mmu->root_hpa = root;
3856                 goto set_root_cr3;
3857         }
3858
3859         /*
3860          * We shadow a 32 bit page table. This may be a legacy 2-level
3861          * or a PAE 3-level page table. In either case we need to be aware that
3862          * the shadow page table may be a PAE or a long mode page table.
3863          */
3864         pm_mask = PT_PRESENT_MASK;
3865         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3866                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3867
3868         for (i = 0; i < 4; ++i) {
3869                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3870
3871                 MMU_WARN_ON(VALID_PAGE(root));
3872                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3873                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3874                         if (!(pdptr & PT_PRESENT_MASK)) {
3875                                 vcpu->arch.mmu->pae_root[i] = 0;
3876                                 continue;
3877                         }
3878                         root_gfn = pdptr >> PAGE_SHIFT;
3879                         if (mmu_check_root(vcpu, root_gfn))
3880                                 return 1;
3881                 }
3882                 spin_lock(&vcpu->kvm->mmu_lock);
3883                 if (make_mmu_pages_available(vcpu) < 0) {
3884                         spin_unlock(&vcpu->kvm->mmu_lock);
3885                         return -ENOSPC;
3886                 }
3887                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3888                                       0, ACC_ALL);
3889                 root = __pa(sp->spt);
3890                 ++sp->root_count;
3891                 spin_unlock(&vcpu->kvm->mmu_lock);
3892
3893                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3894         }
3895         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3896
3897         /*
3898          * If we shadow a 32 bit page table with a long mode page
3899          * table we enter this path.
3900          */
3901         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3902                 if (vcpu->arch.mmu->lm_root == NULL) {
3903                         /*
3904                          * The additional page necessary for this is only
3905                          * allocated on demand.
3906                          */
3907
3908                         u64 *lm_root;
3909
3910                         lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3911                         if (lm_root == NULL)
3912                                 return 1;
3913
3914                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3915
3916                         vcpu->arch.mmu->lm_root = lm_root;
3917                 }
3918
3919                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3920         }
3921
3922 set_root_cr3:
3923         vcpu->arch.mmu->root_cr3 = root_cr3;
3924
3925         return 0;
3926 }
3927
3928 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3929 {
3930         if (vcpu->arch.mmu->direct_map)
3931                 return mmu_alloc_direct_roots(vcpu);
3932         else
3933                 return mmu_alloc_shadow_roots(vcpu);
3934 }
3935
3936 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3937 {
3938         int i;
3939         struct kvm_mmu_page *sp;
3940
3941         if (vcpu->arch.mmu->direct_map)
3942                 return;
3943
3944         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3945                 return;
3946
3947         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3948
3949         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3950                 hpa_t root = vcpu->arch.mmu->root_hpa;
3951                 sp = page_header(root);
3952
3953                 /*
3954                  * Even if another CPU was marking the SP as unsync-ed
3955                  * simultaneously, any guest page table changes are not
3956                  * guaranteed to be visible anyway until this VCPU issues a TLB
3957                  * flush strictly after those changes are made. We only need to
3958                  * ensure that the other CPU sets these flags before any actual
3959                  * changes to the page tables are made. The comments in
3960                  * mmu_need_write_protect() describe what could go wrong if this
3961                  * requirement isn't satisfied.
3962                  */
3963                 if (!smp_load_acquire(&sp->unsync) &&
3964                     !smp_load_acquire(&sp->unsync_children))
3965                         return;
3966
3967                 spin_lock(&vcpu->kvm->mmu_lock);
3968                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3969
3970                 mmu_sync_children(vcpu, sp);
3971
3972                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3973                 spin_unlock(&vcpu->kvm->mmu_lock);
3974                 return;
3975         }
3976
3977         spin_lock(&vcpu->kvm->mmu_lock);
3978         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3979
3980         for (i = 0; i < 4; ++i) {
3981                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3982
3983                 if (root && VALID_PAGE(root)) {
3984                         root &= PT64_BASE_ADDR_MASK;
3985                         sp = page_header(root);
3986                         mmu_sync_children(vcpu, sp);
3987                 }
3988         }
3989
3990         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3991         spin_unlock(&vcpu->kvm->mmu_lock);
3992 }
3993 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3994
3995 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3996                                   u32 access, struct x86_exception *exception)
3997 {
3998         if (exception)
3999                 exception->error_code = 0;
4000         return vaddr;
4001 }
4002
4003 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
4004                                          u32 access,
4005                                          struct x86_exception *exception)
4006 {
4007         if (exception)
4008                 exception->error_code = 0;
4009         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
4010 }
4011
4012 static bool
4013 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
4014 {
4015         int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
4016
4017         return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
4018                 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
4019 }
4020
4021 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
4022 {
4023         return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
4024 }
4025
4026 static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
4027 {
4028         return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
4029 }
4030
4031 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4032 {
4033         /*
4034          * A nested guest cannot use the MMIO cache if it is using nested
4035          * page tables, because cr2 is a nGPA while the cache stores GPAs.
4036          */
4037         if (mmu_is_nested(vcpu))
4038                 return false;
4039
4040         if (direct)
4041                 return vcpu_match_mmio_gpa(vcpu, addr);
4042
4043         return vcpu_match_mmio_gva(vcpu, addr);
4044 }
4045
4046 /* return true if reserved bit is detected on spte. */
4047 static bool
4048 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4049 {
4050         struct kvm_shadow_walk_iterator iterator;
4051         u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
4052         int root, leaf;
4053         bool reserved = false;
4054
4055         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4056                 goto exit;
4057
4058         walk_shadow_page_lockless_begin(vcpu);
4059
4060         for (shadow_walk_init(&iterator, vcpu, addr),
4061                  leaf = root = iterator.level;
4062              shadow_walk_okay(&iterator);
4063              __shadow_walk_next(&iterator, spte)) {
4064                 spte = mmu_spte_get_lockless(iterator.sptep);
4065
4066                 sptes[leaf - 1] = spte;
4067                 leaf--;
4068
4069                 if (!is_shadow_present_pte(spte))
4070                         break;
4071
4072                 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
4073                                                     iterator.level);
4074         }
4075
4076         walk_shadow_page_lockless_end(vcpu);
4077
4078         if (reserved) {
4079                 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
4080                        __func__, addr);
4081                 while (root > leaf) {
4082                         pr_err("------ spte 0x%llx level %d.\n",
4083                                sptes[root - 1], root);
4084                         root--;
4085                 }
4086         }
4087 exit:
4088         *sptep = spte;
4089         return reserved;
4090 }
4091
4092 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4093 {
4094         u64 spte;
4095         bool reserved;
4096
4097         if (mmio_info_in_cache(vcpu, addr, direct))
4098                 return RET_PF_EMULATE;
4099
4100         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4101         if (WARN_ON(reserved))
4102                 return -EINVAL;
4103
4104         if (is_mmio_spte(spte)) {
4105                 gfn_t gfn = get_mmio_spte_gfn(spte);
4106                 unsigned access = get_mmio_spte_access(spte);
4107
4108                 if (!check_mmio_spte(vcpu, spte))
4109                         return RET_PF_INVALID;
4110
4111                 if (direct)
4112                         addr = 0;
4113
4114                 trace_handle_mmio_page_fault(addr, gfn, access);
4115                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4116                 return RET_PF_EMULATE;
4117         }
4118
4119         /*
4120          * If the page table is zapped by other cpus, let CPU fault again on
4121          * the address.
4122          */
4123         return RET_PF_RETRY;
4124 }
4125
4126 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4127                                          u32 error_code, gfn_t gfn)
4128 {
4129         if (unlikely(error_code & PFERR_RSVD_MASK))
4130                 return false;
4131
4132         if (!(error_code & PFERR_PRESENT_MASK) ||
4133               !(error_code & PFERR_WRITE_MASK))
4134                 return false;
4135
4136         /*
4137          * guest is writing the page which is write tracked which can
4138          * not be fixed by page fault handler.
4139          */
4140         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4141                 return true;
4142
4143         return false;
4144 }
4145
4146 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4147 {
4148         struct kvm_shadow_walk_iterator iterator;
4149         u64 spte;
4150
4151         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4152                 return;
4153
4154         walk_shadow_page_lockless_begin(vcpu);
4155         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4156                 clear_sp_write_flooding_count(iterator.sptep);
4157                 if (!is_shadow_present_pte(spte))
4158                         break;
4159         }
4160         walk_shadow_page_lockless_end(vcpu);
4161 }
4162
4163 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
4164                                 u32 error_code, bool prefault)
4165 {
4166         gfn_t gfn = gpa >> PAGE_SHIFT;
4167         int r;
4168
4169         /* Note, paging is disabled, ergo gva == gpa. */
4170         pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
4171
4172         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4173                 return RET_PF_EMULATE;
4174
4175         r = mmu_topup_memory_caches(vcpu);
4176         if (r)
4177                 return r;
4178
4179         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4180
4181
4182         return nonpaging_map(vcpu, gpa & PAGE_MASK,
4183                              error_code, gfn, prefault);
4184 }
4185
4186 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4187                                    gfn_t gfn)
4188 {
4189         struct kvm_arch_async_pf arch;
4190
4191         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4192         arch.gfn = gfn;
4193         arch.direct_map = vcpu->arch.mmu->direct_map;
4194         arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4195
4196         return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4197                                   kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4198 }
4199
4200 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4201                          gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
4202                          bool *writable)
4203 {
4204         struct kvm_memory_slot *slot;
4205         bool async;
4206
4207         /*
4208          * Don't expose private memslots to L2.
4209          */
4210         if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4211                 *pfn = KVM_PFN_NOSLOT;
4212                 return false;
4213         }
4214
4215         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4216         async = false;
4217         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4218         if (!async)
4219                 return false; /* *pfn has correct page already */
4220
4221         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4222                 trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
4223                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4224                         trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
4225                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4226                         return true;
4227                 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
4228                         return true;
4229         }
4230
4231         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4232         return false;
4233 }
4234
4235 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4236                                 u64 fault_address, char *insn, int insn_len)
4237 {
4238         int r = 1;
4239
4240 #ifndef CONFIG_X86_64
4241         /* A 64-bit CR2 should be impossible on 32-bit KVM. */
4242         if (WARN_ON_ONCE(fault_address >> 32))
4243                 return -EFAULT;
4244 #endif
4245
4246         vcpu->arch.l1tf_flush_l1d = true;
4247         switch (vcpu->arch.apf.host_apf_reason) {
4248         default:
4249                 trace_kvm_page_fault(fault_address, error_code);
4250
4251                 if (kvm_event_needs_reinjection(vcpu))
4252                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4253                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4254                                 insn_len);
4255                 break;
4256         case KVM_PV_REASON_PAGE_NOT_PRESENT:
4257                 vcpu->arch.apf.host_apf_reason = 0;
4258                 local_irq_disable();
4259                 kvm_async_pf_task_wait(fault_address, 0);
4260                 local_irq_enable();
4261                 break;
4262         case KVM_PV_REASON_PAGE_READY:
4263                 vcpu->arch.apf.host_apf_reason = 0;
4264                 local_irq_disable();
4265                 kvm_async_pf_task_wake(fault_address);
4266                 local_irq_enable();
4267                 break;
4268         }
4269         return r;
4270 }
4271 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4272
4273 static bool
4274 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4275 {
4276         int page_num = KVM_PAGES_PER_HPAGE(level);
4277
4278         gfn &= ~(page_num - 1);
4279
4280         return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4281 }
4282
4283 static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4284                           bool prefault)
4285 {
4286         kvm_pfn_t pfn;
4287         int r;
4288         int level;
4289         bool force_pt_level;
4290         gfn_t gfn = gpa >> PAGE_SHIFT;
4291         unsigned long mmu_seq;
4292         int write = error_code & PFERR_WRITE_MASK;
4293         bool map_writable;
4294         bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
4295                                 is_nx_huge_page_enabled();
4296
4297         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4298
4299         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4300                 return RET_PF_EMULATE;
4301
4302         r = mmu_topup_memory_caches(vcpu);
4303         if (r)
4304                 return r;
4305
4306         force_pt_level =
4307                 lpage_disallowed ||
4308                 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
4309         level = mapping_level(vcpu, gfn, &force_pt_level);
4310         if (likely(!force_pt_level)) {
4311                 if (level > PT_DIRECTORY_LEVEL &&
4312                     !check_hugepage_cache_consistency(vcpu, gfn, level))
4313                         level = PT_DIRECTORY_LEVEL;
4314                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4315         }
4316
4317         if (fast_page_fault(vcpu, gpa, level, error_code))
4318                 return RET_PF_RETRY;
4319
4320         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4321         smp_rmb();
4322
4323         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4324                 return RET_PF_RETRY;
4325
4326         if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4327                 return r;
4328
4329         r = RET_PF_RETRY;
4330         spin_lock(&vcpu->kvm->mmu_lock);
4331         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4332                 goto out_unlock;
4333         if (make_mmu_pages_available(vcpu) < 0)
4334                 goto out_unlock;
4335         if (likely(!force_pt_level))
4336                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4337         r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
4338                          prefault, lpage_disallowed);
4339 out_unlock:
4340         spin_unlock(&vcpu->kvm->mmu_lock);
4341         kvm_release_pfn_clean(pfn);
4342         return r;
4343 }
4344
4345 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4346                                    struct kvm_mmu *context)
4347 {
4348         context->page_fault = nonpaging_page_fault;
4349         context->gva_to_gpa = nonpaging_gva_to_gpa;
4350         context->sync_page = nonpaging_sync_page;
4351         context->invlpg = nonpaging_invlpg;
4352         context->root_level = 0;
4353         context->shadow_root_level = PT32E_ROOT_LEVEL;
4354         context->direct_map = true;
4355         context->nx = false;
4356 }
4357
4358 /*
4359  * Find out if a previously cached root matching the new CR3/role is available.
4360  * The current root is also inserted into the cache.
4361  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4362  * returned.
4363  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4364  * false is returned. This root should now be freed by the caller.
4365  */
4366 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4367                                   union kvm_mmu_page_role new_role)
4368 {
4369         uint i;
4370         struct kvm_mmu_root_info root;
4371         struct kvm_mmu *mmu = vcpu->arch.mmu;
4372
4373         root.cr3 = mmu->root_cr3;
4374         root.hpa = mmu->root_hpa;
4375
4376         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4377                 swap(root, mmu->prev_roots[i]);
4378
4379                 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4380                     page_header(root.hpa) != NULL &&
4381                     new_role.word == page_header(root.hpa)->role.word)
4382                         break;
4383         }
4384
4385         mmu->root_hpa = root.hpa;
4386         mmu->root_cr3 = root.cr3;
4387
4388         return i < KVM_MMU_NUM_PREV_ROOTS;
4389 }
4390
4391 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4392                             union kvm_mmu_page_role new_role,
4393                             bool skip_tlb_flush)
4394 {
4395         struct kvm_mmu *mmu = vcpu->arch.mmu;
4396
4397         /*
4398          * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4399          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4400          * later if necessary.
4401          */
4402         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4403             mmu->root_level >= PT64_ROOT_4LEVEL) {
4404                 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4405                         return false;
4406
4407                 if (cached_root_available(vcpu, new_cr3, new_role)) {
4408                         /*
4409                          * It is possible that the cached previous root page is
4410                          * obsolete because of a change in the MMU generation
4411                          * number. However, changing the generation number is
4412                          * accompanied by KVM_REQ_MMU_RELOAD, which will free
4413                          * the root set here and allocate a new one.
4414                          */
4415                         kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4416                         if (!skip_tlb_flush) {
4417                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4418                                 kvm_x86_ops->tlb_flush(vcpu, true);
4419                         }
4420
4421                         /*
4422                          * The last MMIO access's GVA and GPA are cached in the
4423                          * VCPU. When switching to a new CR3, that GVA->GPA
4424                          * mapping may no longer be valid. So clear any cached
4425                          * MMIO info even when we don't need to sync the shadow
4426                          * page tables.
4427                          */
4428                         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4429
4430                         __clear_sp_write_flooding_count(
4431                                 page_header(mmu->root_hpa));
4432
4433                         return true;
4434                 }
4435         }
4436
4437         return false;
4438 }
4439
4440 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4441                               union kvm_mmu_page_role new_role,
4442                               bool skip_tlb_flush)
4443 {
4444         if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4445                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4446                                    KVM_MMU_ROOT_CURRENT);
4447 }
4448
4449 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4450 {
4451         __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4452                           skip_tlb_flush);
4453 }
4454 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4455
4456 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4457 {
4458         return kvm_read_cr3(vcpu);
4459 }
4460
4461 static void inject_page_fault(struct kvm_vcpu *vcpu,
4462                               struct x86_exception *fault)
4463 {
4464         vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4465 }
4466
4467 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4468                            unsigned access, int *nr_present)
4469 {
4470         if (unlikely(is_mmio_spte(*sptep))) {
4471                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4472                         mmu_spte_clear_no_track(sptep);
4473                         return true;
4474                 }
4475
4476                 (*nr_present)++;
4477                 mark_mmio_spte(vcpu, sptep, gfn, access);
4478                 return true;
4479         }
4480
4481         return false;
4482 }
4483
4484 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4485                                 unsigned level, unsigned gpte)
4486 {
4487         /*
4488          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4489          * If it is clear, there are no large pages at this level, so clear
4490          * PT_PAGE_SIZE_MASK in gpte if that is the case.
4491          */
4492         gpte &= level - mmu->last_nonleaf_level;
4493
4494         /*
4495          * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4496          * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4497          * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4498          */
4499         gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4500
4501         return gpte & PT_PAGE_SIZE_MASK;
4502 }
4503
4504 #define PTTYPE_EPT 18 /* arbitrary */
4505 #define PTTYPE PTTYPE_EPT
4506 #include "paging_tmpl.h"
4507 #undef PTTYPE
4508
4509 #define PTTYPE 64
4510 #include "paging_tmpl.h"
4511 #undef PTTYPE
4512
4513 #define PTTYPE 32
4514 #include "paging_tmpl.h"
4515 #undef PTTYPE
4516
4517 static void
4518 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4519                         struct rsvd_bits_validate *rsvd_check,
4520                         int maxphyaddr, int level, bool nx, bool gbpages,
4521                         bool pse, bool amd)
4522 {
4523         u64 exb_bit_rsvd = 0;
4524         u64 gbpages_bit_rsvd = 0;
4525         u64 nonleaf_bit8_rsvd = 0;
4526
4527         rsvd_check->bad_mt_xwr = 0;
4528
4529         if (!nx)
4530                 exb_bit_rsvd = rsvd_bits(63, 63);
4531         if (!gbpages)
4532                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4533
4534         /*
4535          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4536          * leaf entries) on AMD CPUs only.
4537          */
4538         if (amd)
4539                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4540
4541         switch (level) {
4542         case PT32_ROOT_LEVEL:
4543                 /* no rsvd bits for 2 level 4K page table entries */
4544                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4545                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4546                 rsvd_check->rsvd_bits_mask[1][0] =
4547                         rsvd_check->rsvd_bits_mask[0][0];
4548
4549                 if (!pse) {
4550                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4551                         break;
4552                 }
4553
4554                 if (is_cpuid_PSE36())
4555                         /* 36bits PSE 4MB page */
4556                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4557                 else
4558                         /* 32 bits PSE 4MB page */
4559                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4560                 break;
4561         case PT32E_ROOT_LEVEL:
4562                 rsvd_check->rsvd_bits_mask[0][2] =
4563                         rsvd_bits(maxphyaddr, 63) |
4564                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4565                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4566                         rsvd_bits(maxphyaddr, 62);      /* PDE */
4567                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4568                         rsvd_bits(maxphyaddr, 62);      /* PTE */
4569                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4570                         rsvd_bits(maxphyaddr, 62) |
4571                         rsvd_bits(13, 20);              /* large page */
4572                 rsvd_check->rsvd_bits_mask[1][0] =
4573                         rsvd_check->rsvd_bits_mask[0][0];
4574                 break;
4575         case PT64_ROOT_5LEVEL:
4576                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4577                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4578                         rsvd_bits(maxphyaddr, 51);
4579                 rsvd_check->rsvd_bits_mask[1][4] =
4580                         rsvd_check->rsvd_bits_mask[0][4];
4581                 /* fall through */
4582         case PT64_ROOT_4LEVEL:
4583                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4584                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4585                         rsvd_bits(maxphyaddr, 51);
4586                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4587                         gbpages_bit_rsvd |
4588                         rsvd_bits(maxphyaddr, 51);
4589                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4590                         rsvd_bits(maxphyaddr, 51);
4591                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4592                         rsvd_bits(maxphyaddr, 51);
4593                 rsvd_check->rsvd_bits_mask[1][3] =
4594                         rsvd_check->rsvd_bits_mask[0][3];
4595                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4596                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4597                         rsvd_bits(13, 29);
4598                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4599                         rsvd_bits(maxphyaddr, 51) |
4600                         rsvd_bits(13, 20);              /* large page */
4601                 rsvd_check->rsvd_bits_mask[1][0] =
4602                         rsvd_check->rsvd_bits_mask[0][0];
4603                 break;
4604         }
4605 }
4606
4607 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4608                                   struct kvm_mmu *context)
4609 {
4610         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4611                                 cpuid_maxphyaddr(vcpu), context->root_level,
4612                                 context->nx,
4613                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4614                                 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4615 }
4616
4617 static void
4618 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4619                             int maxphyaddr, bool execonly)
4620 {
4621         u64 bad_mt_xwr;
4622
4623         rsvd_check->rsvd_bits_mask[0][4] =
4624                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4625         rsvd_check->rsvd_bits_mask[0][3] =
4626                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4627         rsvd_check->rsvd_bits_mask[0][2] =
4628                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4629         rsvd_check->rsvd_bits_mask[0][1] =
4630                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4631         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4632
4633         /* large page */
4634         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4635         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4636         rsvd_check->rsvd_bits_mask[1][2] =
4637                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4638         rsvd_check->rsvd_bits_mask[1][1] =
4639                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4640         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4641
4642         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4643         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4644         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4645         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4646         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4647         if (!execonly) {
4648                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4649                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4650         }
4651         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4652 }
4653
4654 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4655                 struct kvm_mmu *context, bool execonly)
4656 {
4657         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4658                                     cpuid_maxphyaddr(vcpu), execonly);
4659 }
4660
4661 /*
4662  * the page table on host is the shadow page table for the page
4663  * table in guest or amd nested guest, its mmu features completely
4664  * follow the features in guest.
4665  */
4666 void
4667 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4668 {
4669         /*
4670          * KVM uses NX when TDP is disabled to handle a variety of scenarios,
4671          * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
4672          * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
4673          * The iTLB multi-hit workaround can be toggled at any time, so assume
4674          * NX can be used by any non-nested shadow MMU to avoid having to reset
4675          * MMU contexts.  Note, KVM forces EFER.NX=1 when TDP is disabled.
4676          */
4677         bool uses_nx = context->nx || !tdp_enabled ||
4678                 context->mmu_role.base.smep_andnot_wp;
4679         struct rsvd_bits_validate *shadow_zero_check;
4680         int i;
4681
4682         /*
4683          * Passing "true" to the last argument is okay; it adds a check
4684          * on bit 8 of the SPTEs which KVM doesn't use anyway.
4685          */
4686         shadow_zero_check = &context->shadow_zero_check;
4687         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4688                                 shadow_phys_bits,
4689                                 context->shadow_root_level, uses_nx,
4690                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4691                                 is_pse(vcpu), true);
4692
4693         if (!shadow_me_mask)
4694                 return;
4695
4696         for (i = context->shadow_root_level; --i >= 0;) {
4697                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4698                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4699         }
4700
4701 }
4702 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4703
4704 static inline bool boot_cpu_is_amd(void)
4705 {
4706         WARN_ON_ONCE(!tdp_enabled);
4707         return shadow_x_mask == 0;
4708 }
4709
4710 /*
4711  * the direct page table on host, use as much mmu features as
4712  * possible, however, kvm currently does not do execution-protection.
4713  */
4714 static void
4715 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4716                                 struct kvm_mmu *context)
4717 {
4718         struct rsvd_bits_validate *shadow_zero_check;
4719         int i;
4720
4721         shadow_zero_check = &context->shadow_zero_check;
4722
4723         if (boot_cpu_is_amd())
4724                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4725                                         shadow_phys_bits,
4726                                         context->shadow_root_level, false,
4727                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4728                                         true, true);
4729         else
4730                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4731                                             shadow_phys_bits,
4732                                             false);
4733
4734         if (!shadow_me_mask)
4735                 return;
4736
4737         for (i = context->shadow_root_level; --i >= 0;) {
4738                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4739                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4740         }
4741 }
4742
4743 /*
4744  * as the comments in reset_shadow_zero_bits_mask() except it
4745  * is the shadow page table for intel nested guest.
4746  */
4747 static void
4748 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4749                                 struct kvm_mmu *context, bool execonly)
4750 {
4751         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4752                                     shadow_phys_bits, execonly);
4753 }
4754
4755 #define BYTE_MASK(access) \
4756         ((1 & (access) ? 2 : 0) | \
4757          (2 & (access) ? 4 : 0) | \
4758          (3 & (access) ? 8 : 0) | \
4759          (4 & (access) ? 16 : 0) | \
4760          (5 & (access) ? 32 : 0) | \
4761          (6 & (access) ? 64 : 0) | \
4762          (7 & (access) ? 128 : 0))
4763
4764
4765 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4766                                       struct kvm_mmu *mmu, bool ept)
4767 {
4768         unsigned byte;
4769
4770         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4771         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4772         const u8 u = BYTE_MASK(ACC_USER_MASK);
4773
4774         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4775         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4776         bool cr0_wp = is_write_protection(vcpu);
4777
4778         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4779                 unsigned pfec = byte << 1;
4780
4781                 /*
4782                  * Each "*f" variable has a 1 bit for each UWX value
4783                  * that causes a fault with the given PFEC.
4784                  */
4785
4786                 /* Faults from writes to non-writable pages */
4787                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4788                 /* Faults from user mode accesses to supervisor pages */
4789                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4790                 /* Faults from fetches of non-executable pages*/
4791                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4792                 /* Faults from kernel mode fetches of user pages */
4793                 u8 smepf = 0;
4794                 /* Faults from kernel mode accesses of user pages */
4795                 u8 smapf = 0;
4796
4797                 if (!ept) {
4798                         /* Faults from kernel mode accesses to user pages */
4799                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4800
4801                         /* Not really needed: !nx will cause pte.nx to fault */
4802                         if (!mmu->nx)
4803                                 ff = 0;
4804
4805                         /* Allow supervisor writes if !cr0.wp */
4806                         if (!cr0_wp)
4807                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4808
4809                         /* Disallow supervisor fetches of user code if cr4.smep */
4810                         if (cr4_smep)
4811                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4812
4813                         /*
4814                          * SMAP:kernel-mode data accesses from user-mode
4815                          * mappings should fault. A fault is considered
4816                          * as a SMAP violation if all of the following
4817                          * conditions are true:
4818                          *   - X86_CR4_SMAP is set in CR4
4819                          *   - A user page is accessed
4820                          *   - The access is not a fetch
4821                          *   - Page fault in kernel mode
4822                          *   - if CPL = 3 or X86_EFLAGS_AC is clear
4823                          *
4824                          * Here, we cover the first three conditions.
4825                          * The fourth is computed dynamically in permission_fault();
4826                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4827                          * *not* subject to SMAP restrictions.
4828                          */
4829                         if (cr4_smap)
4830                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4831                 }
4832
4833                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4834         }
4835 }
4836
4837 /*
4838 * PKU is an additional mechanism by which the paging controls access to
4839 * user-mode addresses based on the value in the PKRU register.  Protection
4840 * key violations are reported through a bit in the page fault error code.
4841 * Unlike other bits of the error code, the PK bit is not known at the
4842 * call site of e.g. gva_to_gpa; it must be computed directly in
4843 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4844 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4845 *
4846 * In particular the following conditions come from the error code, the
4847 * page tables and the machine state:
4848 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4849 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4850 * - PK is always zero if U=0 in the page tables
4851 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4852 *
4853 * The PKRU bitmask caches the result of these four conditions.  The error
4854 * code (minus the P bit) and the page table's U bit form an index into the
4855 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4856 * with the two bits of the PKRU register corresponding to the protection key.
4857 * For the first three conditions above the bits will be 00, thus masking
4858 * away both AD and WD.  For all reads or if the last condition holds, WD
4859 * only will be masked away.
4860 */
4861 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4862                                 bool ept)
4863 {
4864         unsigned bit;
4865         bool wp;
4866
4867         if (ept) {
4868                 mmu->pkru_mask = 0;
4869                 return;
4870         }
4871
4872         /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4873         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4874                 mmu->pkru_mask = 0;
4875                 return;
4876         }
4877
4878         wp = is_write_protection(vcpu);
4879
4880         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4881                 unsigned pfec, pkey_bits;
4882                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4883
4884                 pfec = bit << 1;
4885                 ff = pfec & PFERR_FETCH_MASK;
4886                 uf = pfec & PFERR_USER_MASK;
4887                 wf = pfec & PFERR_WRITE_MASK;
4888
4889                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4890                 pte_user = pfec & PFERR_RSVD_MASK;
4891
4892                 /*
4893                  * Only need to check the access which is not an
4894                  * instruction fetch and is to a user page.
4895                  */
4896                 check_pkey = (!ff && pte_user);
4897                 /*
4898                  * write access is controlled by PKRU if it is a
4899                  * user access or CR0.WP = 1.
4900                  */
4901                 check_write = check_pkey && wf && (uf || wp);
4902
4903                 /* PKRU.AD stops both read and write access. */
4904                 pkey_bits = !!check_pkey;
4905                 /* PKRU.WD stops write access. */
4906                 pkey_bits |= (!!check_write) << 1;
4907
4908                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4909         }
4910 }
4911
4912 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4913 {
4914         unsigned root_level = mmu->root_level;
4915
4916         mmu->last_nonleaf_level = root_level;
4917         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4918                 mmu->last_nonleaf_level++;
4919 }
4920
4921 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4922                                          struct kvm_mmu *context,
4923                                          int level)
4924 {
4925         context->nx = is_nx(vcpu);
4926         context->root_level = level;
4927
4928         reset_rsvds_bits_mask(vcpu, context);
4929         update_permission_bitmask(vcpu, context, false);
4930         update_pkru_bitmask(vcpu, context, false);
4931         update_last_nonleaf_level(vcpu, context);
4932
4933         MMU_WARN_ON(!is_pae(vcpu));
4934         context->page_fault = paging64_page_fault;
4935         context->gva_to_gpa = paging64_gva_to_gpa;
4936         context->sync_page = paging64_sync_page;
4937         context->invlpg = paging64_invlpg;
4938         context->shadow_root_level = level;
4939         context->direct_map = false;
4940 }
4941
4942 static void paging64_init_context(struct kvm_vcpu *vcpu,
4943                                   struct kvm_mmu *context)
4944 {
4945         int root_level = is_la57_mode(vcpu) ?
4946                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4947
4948         paging64_init_context_common(vcpu, context, root_level);
4949 }
4950
4951 static void paging32_init_context(struct kvm_vcpu *vcpu,
4952                                   struct kvm_mmu *context)
4953 {
4954         context->nx = false;
4955         context->root_level = PT32_ROOT_LEVEL;
4956
4957         reset_rsvds_bits_mask(vcpu, context);
4958         update_permission_bitmask(vcpu, context, false);
4959         update_pkru_bitmask(vcpu, context, false);
4960         update_last_nonleaf_level(vcpu, context);
4961
4962         context->page_fault = paging32_page_fault;
4963         context->gva_to_gpa = paging32_gva_to_gpa;
4964         context->sync_page = paging32_sync_page;
4965         context->invlpg = paging32_invlpg;
4966         context->shadow_root_level = PT32E_ROOT_LEVEL;
4967         context->direct_map = false;
4968 }
4969
4970 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4971                                    struct kvm_mmu *context)
4972 {
4973         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4974 }
4975
4976 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4977 {
4978         union kvm_mmu_extended_role ext = {0};
4979
4980         ext.cr0_pg = !!is_paging(vcpu);
4981         ext.cr4_pae = !!is_pae(vcpu);
4982         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4983         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4984         ext.cr4_pse = !!is_pse(vcpu);
4985         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4986         ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4987         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4988
4989         ext.valid = 1;
4990
4991         return ext;
4992 }
4993
4994 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4995                                                    bool base_only)
4996 {
4997         union kvm_mmu_role role = {0};
4998
4999         role.base.access = ACC_ALL;
5000         role.base.nxe = !!is_nx(vcpu);
5001         role.base.cr0_wp = is_write_protection(vcpu);
5002         role.base.smm = is_smm(vcpu);
5003         role.base.guest_mode = is_guest_mode(vcpu);
5004
5005         if (base_only)
5006                 return role;
5007
5008         role.ext = kvm_calc_mmu_role_ext(vcpu);
5009
5010         return role;
5011 }
5012
5013 static union kvm_mmu_role
5014 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5015 {
5016         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5017
5018         role.base.ad_disabled = (shadow_accessed_mask == 0);
5019         role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
5020         role.base.direct = true;
5021         role.base.gpte_is_8_bytes = true;
5022
5023         return role;
5024 }
5025
5026 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
5027 {
5028         struct kvm_mmu *context = vcpu->arch.mmu;
5029         union kvm_mmu_role new_role =
5030                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
5031
5032         new_role.base.word &= mmu_base_role_mask.word;
5033         if (new_role.as_u64 == context->mmu_role.as_u64)
5034                 return;
5035
5036         context->mmu_role.as_u64 = new_role.as_u64;
5037         context->page_fault = tdp_page_fault;
5038         context->sync_page = nonpaging_sync_page;
5039         context->invlpg = nonpaging_invlpg;
5040         context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
5041         context->direct_map = true;
5042         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
5043         context->get_cr3 = get_cr3;
5044         context->get_pdptr = kvm_pdptr_read;
5045         context->inject_page_fault = kvm_inject_page_fault;
5046
5047         if (!is_paging(vcpu)) {
5048                 context->nx = false;
5049                 context->gva_to_gpa = nonpaging_gva_to_gpa;
5050                 context->root_level = 0;
5051         } else if (is_long_mode(vcpu)) {
5052                 context->nx = is_nx(vcpu);
5053                 context->root_level = is_la57_mode(vcpu) ?
5054                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5055                 reset_rsvds_bits_mask(vcpu, context);
5056                 context->gva_to_gpa = paging64_gva_to_gpa;
5057         } else if (is_pae(vcpu)) {
5058                 context->nx = is_nx(vcpu);
5059                 context->root_level = PT32E_ROOT_LEVEL;
5060                 reset_rsvds_bits_mask(vcpu, context);
5061                 context->gva_to_gpa = paging64_gva_to_gpa;
5062         } else {
5063                 context->nx = false;
5064                 context->root_level = PT32_ROOT_LEVEL;
5065                 reset_rsvds_bits_mask(vcpu, context);
5066                 context->gva_to_gpa = paging32_gva_to_gpa;
5067         }
5068
5069         update_permission_bitmask(vcpu, context, false);
5070         update_pkru_bitmask(vcpu, context, false);
5071         update_last_nonleaf_level(vcpu, context);
5072         reset_tdp_shadow_zero_bits_mask(vcpu, context);
5073 }
5074
5075 static union kvm_mmu_role
5076 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5077 {
5078         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5079
5080         role.base.smep_andnot_wp = role.ext.cr4_smep &&
5081                 !is_write_protection(vcpu);
5082         role.base.smap_andnot_wp = role.ext.cr4_smap &&
5083                 !is_write_protection(vcpu);
5084         role.base.direct = !is_paging(vcpu);
5085         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
5086
5087         if (!is_long_mode(vcpu))
5088                 role.base.level = PT32E_ROOT_LEVEL;
5089         else if (is_la57_mode(vcpu))
5090                 role.base.level = PT64_ROOT_5LEVEL;
5091         else
5092                 role.base.level = PT64_ROOT_4LEVEL;
5093
5094         return role;
5095 }
5096
5097 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
5098 {
5099         struct kvm_mmu *context = vcpu->arch.mmu;
5100         union kvm_mmu_role new_role =
5101                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
5102
5103         new_role.base.word &= mmu_base_role_mask.word;
5104         if (new_role.as_u64 == context->mmu_role.as_u64)
5105                 return;
5106
5107         if (!is_paging(vcpu))
5108                 nonpaging_init_context(vcpu, context);
5109         else if (is_long_mode(vcpu))
5110                 paging64_init_context(vcpu, context);
5111         else if (is_pae(vcpu))
5112                 paging32E_init_context(vcpu, context);
5113         else
5114                 paging32_init_context(vcpu, context);
5115
5116         context->mmu_role.as_u64 = new_role.as_u64;
5117         reset_shadow_zero_bits_mask(vcpu, context);
5118 }
5119 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
5120
5121 static union kvm_mmu_role
5122 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5123                                    bool execonly)
5124 {
5125         union kvm_mmu_role role = {0};
5126
5127         /* SMM flag is inherited from root_mmu */
5128         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5129
5130         role.base.level = PT64_ROOT_4LEVEL;
5131         role.base.gpte_is_8_bytes = true;
5132         role.base.direct = false;
5133         role.base.ad_disabled = !accessed_dirty;
5134         role.base.guest_mode = true;
5135         role.base.access = ACC_ALL;
5136
5137         /*
5138          * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
5139          * SMAP variation to denote shadow EPT entries.
5140          */
5141         role.base.cr0_wp = true;
5142         role.base.smap_andnot_wp = true;
5143
5144         role.ext = kvm_calc_mmu_role_ext(vcpu);
5145         role.ext.execonly = execonly;
5146
5147         return role;
5148 }
5149
5150 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5151                              bool accessed_dirty, gpa_t new_eptp)
5152 {
5153         struct kvm_mmu *context = vcpu->arch.mmu;
5154         union kvm_mmu_role new_role =
5155                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5156                                                    execonly);
5157
5158         __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5159
5160         new_role.base.word &= mmu_base_role_mask.word;
5161         if (new_role.as_u64 == context->mmu_role.as_u64)
5162                 return;
5163
5164         context->shadow_root_level = PT64_ROOT_4LEVEL;
5165
5166         context->nx = true;
5167         context->ept_ad = accessed_dirty;
5168         context->page_fault = ept_page_fault;
5169         context->gva_to_gpa = ept_gva_to_gpa;
5170         context->sync_page = ept_sync_page;
5171         context->invlpg = ept_invlpg;
5172         context->root_level = PT64_ROOT_4LEVEL;
5173         context->direct_map = false;
5174         context->mmu_role.as_u64 = new_role.as_u64;
5175
5176         update_permission_bitmask(vcpu, context, true);
5177         update_pkru_bitmask(vcpu, context, true);
5178         update_last_nonleaf_level(vcpu, context);
5179         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5180         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5181 }
5182 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5183
5184 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5185 {
5186         struct kvm_mmu *context = vcpu->arch.mmu;
5187
5188         kvm_init_shadow_mmu(vcpu);
5189         context->set_cr3           = kvm_x86_ops->set_cr3;
5190         context->get_cr3           = get_cr3;
5191         context->get_pdptr         = kvm_pdptr_read;
5192         context->inject_page_fault = kvm_inject_page_fault;
5193 }
5194
5195 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5196 {
5197         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5198         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5199
5200         new_role.base.word &= mmu_base_role_mask.word;
5201         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5202                 return;
5203
5204         g_context->mmu_role.as_u64 = new_role.as_u64;
5205         g_context->get_cr3           = get_cr3;
5206         g_context->get_pdptr         = kvm_pdptr_read;
5207         g_context->inject_page_fault = kvm_inject_page_fault;
5208
5209         /*
5210          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5211          * L1's nested page tables (e.g. EPT12). The nested translation
5212          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5213          * L2's page tables as the first level of translation and L1's
5214          * nested page tables as the second level of translation. Basically
5215          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5216          */
5217         if (!is_paging(vcpu)) {
5218                 g_context->nx = false;
5219                 g_context->root_level = 0;
5220                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5221         } else if (is_long_mode(vcpu)) {
5222                 g_context->nx = is_nx(vcpu);
5223                 g_context->root_level = is_la57_mode(vcpu) ?
5224                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5225                 reset_rsvds_bits_mask(vcpu, g_context);
5226                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5227         } else if (is_pae(vcpu)) {
5228                 g_context->nx = is_nx(vcpu);
5229                 g_context->root_level = PT32E_ROOT_LEVEL;
5230                 reset_rsvds_bits_mask(vcpu, g_context);
5231                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5232         } else {
5233                 g_context->nx = false;
5234                 g_context->root_level = PT32_ROOT_LEVEL;
5235                 reset_rsvds_bits_mask(vcpu, g_context);
5236                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5237         }
5238
5239         update_permission_bitmask(vcpu, g_context, false);
5240         update_pkru_bitmask(vcpu, g_context, false);
5241         update_last_nonleaf_level(vcpu, g_context);
5242 }
5243
5244 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5245 {
5246         if (reset_roots) {
5247                 uint i;
5248
5249                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5250
5251                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5252                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5253         }
5254
5255         if (mmu_is_nested(vcpu))
5256                 init_kvm_nested_mmu(vcpu);
5257         else if (tdp_enabled)
5258                 init_kvm_tdp_mmu(vcpu);
5259         else
5260                 init_kvm_softmmu(vcpu);
5261 }
5262 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5263
5264 static union kvm_mmu_page_role
5265 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5266 {
5267         union kvm_mmu_role role;
5268
5269         if (tdp_enabled)
5270                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5271         else
5272                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5273
5274         return role.base;
5275 }
5276
5277 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5278 {
5279         kvm_mmu_unload(vcpu);
5280         kvm_init_mmu(vcpu, true);
5281 }
5282 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5283
5284 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5285 {
5286         int r;
5287
5288         r = mmu_topup_memory_caches(vcpu);
5289         if (r)
5290                 goto out;
5291         r = mmu_alloc_roots(vcpu);
5292         kvm_mmu_sync_roots(vcpu);
5293         if (r)
5294                 goto out;
5295         kvm_mmu_load_cr3(vcpu);
5296         kvm_x86_ops->tlb_flush(vcpu, true);
5297 out:
5298         return r;
5299 }
5300 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5301
5302 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5303 {
5304         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5305         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5306         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5307         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5308 }
5309 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5310
5311 static bool need_remote_flush(u64 old, u64 new)
5312 {
5313         if (!is_shadow_present_pte(old))
5314                 return false;
5315         if (!is_shadow_present_pte(new))
5316                 return true;
5317         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5318                 return true;
5319         old ^= shadow_nx_mask;
5320         new ^= shadow_nx_mask;
5321         return (old & ~new & PT64_PERM_MASK) != 0;
5322 }
5323
5324 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5325                                     int *bytes)
5326 {
5327         u64 gentry = 0;
5328         int r;
5329
5330         /*
5331          * Assume that the pte write on a page table of the same type
5332          * as the current vcpu paging mode since we update the sptes only
5333          * when they have the same mode.
5334          */
5335         if (is_pae(vcpu) && *bytes == 4) {
5336                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5337                 *gpa &= ~(gpa_t)7;
5338                 *bytes = 8;
5339         }
5340
5341         if (*bytes == 4 || *bytes == 8) {
5342                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5343                 if (r)
5344                         gentry = 0;
5345         }
5346
5347         return gentry;
5348 }
5349
5350 /*
5351  * If we're seeing too many writes to a page, it may no longer be a page table,
5352  * or we may be forking, in which case it is better to unmap the page.
5353  */
5354 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5355 {
5356         /*
5357          * Skip write-flooding detected for the sp whose level is 1, because
5358          * it can become unsync, then the guest page is not write-protected.
5359          */
5360         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5361                 return false;
5362
5363         atomic_inc(&sp->write_flooding_count);
5364         return atomic_read(&sp->write_flooding_count) >= 3;
5365 }
5366
5367 /*
5368  * Misaligned accesses are too much trouble to fix up; also, they usually
5369  * indicate a page is not used as a page table.
5370  */
5371 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5372                                     int bytes)
5373 {
5374         unsigned offset, pte_size, misaligned;
5375
5376         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5377                  gpa, bytes, sp->role.word);
5378
5379         offset = offset_in_page(gpa);
5380         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5381
5382         /*
5383          * Sometimes, the OS only writes the last one bytes to update status
5384          * bits, for example, in linux, andb instruction is used in clear_bit().
5385          */
5386         if (!(offset & (pte_size - 1)) && bytes == 1)
5387                 return false;
5388
5389         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5390         misaligned |= bytes < 4;
5391
5392         return misaligned;
5393 }
5394
5395 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5396 {
5397         unsigned page_offset, quadrant;
5398         u64 *spte;
5399         int level;
5400
5401         page_offset = offset_in_page(gpa);
5402         level = sp->role.level;
5403         *nspte = 1;
5404         if (!sp->role.gpte_is_8_bytes) {
5405                 page_offset <<= 1;      /* 32->64 */
5406                 /*
5407                  * A 32-bit pde maps 4MB while the shadow pdes map
5408                  * only 2MB.  So we need to double the offset again
5409                  * and zap two pdes instead of one.
5410                  */
5411                 if (level == PT32_ROOT_LEVEL) {
5412                         page_offset &= ~7; /* kill rounding error */
5413                         page_offset <<= 1;
5414                         *nspte = 2;
5415                 }
5416                 quadrant = page_offset >> PAGE_SHIFT;
5417                 page_offset &= ~PAGE_MASK;
5418                 if (quadrant != sp->role.quadrant)
5419                         return NULL;
5420         }
5421
5422         spte = &sp->spt[page_offset / sizeof(*spte)];
5423         return spte;
5424 }
5425
5426 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5427                               const u8 *new, int bytes,
5428                               struct kvm_page_track_notifier_node *node)
5429 {
5430         gfn_t gfn = gpa >> PAGE_SHIFT;
5431         struct kvm_mmu_page *sp;
5432         LIST_HEAD(invalid_list);
5433         u64 entry, gentry, *spte;
5434         int npte;
5435         bool remote_flush, local_flush;
5436
5437         /*
5438          * If we don't have indirect shadow pages, it means no page is
5439          * write-protected, so we can exit simply.
5440          */
5441         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5442                 return;
5443
5444         remote_flush = local_flush = false;
5445
5446         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5447
5448         /*
5449          * No need to care whether allocation memory is successful
5450          * or not since pte prefetch is skiped if it does not have
5451          * enough objects in the cache.
5452          */
5453         mmu_topup_memory_caches(vcpu);
5454
5455         spin_lock(&vcpu->kvm->mmu_lock);
5456
5457         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5458
5459         ++vcpu->kvm->stat.mmu_pte_write;
5460         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5461
5462         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5463                 if (detect_write_misaligned(sp, gpa, bytes) ||
5464                       detect_write_flooding(sp)) {
5465                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5466                         ++vcpu->kvm->stat.mmu_flooded;
5467                         continue;
5468                 }
5469
5470                 spte = get_written_sptes(sp, gpa, &npte);
5471                 if (!spte)
5472                         continue;
5473
5474                 local_flush = true;
5475                 while (npte--) {
5476                         entry = *spte;
5477                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
5478                         if (gentry && sp->role.level != PG_LEVEL_4K)
5479                                 ++vcpu->kvm->stat.mmu_pde_zapped;
5480                         if (need_remote_flush(entry, *spte))
5481                                 remote_flush = true;
5482                         ++spte;
5483                 }
5484         }
5485         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5486         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5487         spin_unlock(&vcpu->kvm->mmu_lock);
5488 }
5489
5490 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5491 {
5492         gpa_t gpa;
5493         int r;
5494
5495         if (vcpu->arch.mmu->direct_map)
5496                 return 0;
5497
5498         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5499
5500         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5501
5502         return r;
5503 }
5504 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5505
5506 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5507 {
5508         LIST_HEAD(invalid_list);
5509
5510         if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5511                 return 0;
5512
5513         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5514                 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5515                         break;
5516
5517                 ++vcpu->kvm->stat.mmu_recycled;
5518         }
5519         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5520
5521         if (!kvm_mmu_available_pages(vcpu->kvm))
5522                 return -ENOSPC;
5523         return 0;
5524 }
5525
5526 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5527                        void *insn, int insn_len)
5528 {
5529         int r, emulation_type = 0;
5530         bool direct = vcpu->arch.mmu->direct_map;
5531
5532         /* With shadow page tables, fault_address contains a GVA or nGPA.  */
5533         if (vcpu->arch.mmu->direct_map) {
5534                 vcpu->arch.gpa_available = true;
5535                 vcpu->arch.gpa_val = cr2_or_gpa;
5536         }
5537
5538         r = RET_PF_INVALID;
5539         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5540                 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5541                 if (r == RET_PF_EMULATE)
5542                         goto emulate;
5543         }
5544
5545         if (r == RET_PF_INVALID) {
5546                 r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
5547                                                lower_32_bits(error_code),
5548                                                false);
5549                 WARN_ON(r == RET_PF_INVALID);
5550         }
5551
5552         if (r == RET_PF_RETRY)
5553                 return 1;
5554         if (r < 0)
5555                 return r;
5556
5557         /*
5558          * Before emulating the instruction, check if the error code
5559          * was due to a RO violation while translating the guest page.
5560          * This can occur when using nested virtualization with nested
5561          * paging in both guests. If true, we simply unprotect the page
5562          * and resume the guest.
5563          */
5564         if (vcpu->arch.mmu->direct_map &&
5565             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5566                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5567                 return 1;
5568         }
5569
5570         /*
5571          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5572          * optimistically try to just unprotect the page and let the processor
5573          * re-execute the instruction that caused the page fault.  Do not allow
5574          * retrying MMIO emulation, as it's not only pointless but could also
5575          * cause us to enter an infinite loop because the processor will keep
5576          * faulting on the non-existent MMIO address.  Retrying an instruction
5577          * from a nested guest is also pointless and dangerous as we are only
5578          * explicitly shadowing L1's page tables, i.e. unprotecting something
5579          * for L1 isn't going to magically fix whatever issue cause L2 to fail.
5580          */
5581         if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5582                 emulation_type = EMULTYPE_ALLOW_RETRY;
5583 emulate:
5584         /*
5585          * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5586          * This can happen if a guest gets a page-fault on data access but the HW
5587          * table walker is not able to read the instruction page (e.g instruction
5588          * page is not present in memory). In those cases we simply restart the
5589          * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5590          */
5591         if (unlikely(insn && !insn_len)) {
5592                 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5593                         return 1;
5594         }
5595
5596         return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5597                                        insn_len);
5598 }
5599 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5600
5601 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5602 {
5603         struct kvm_mmu *mmu = vcpu->arch.mmu;
5604         int i;
5605
5606         /* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
5607         if (is_noncanonical_address(gva, vcpu))
5608                 return;
5609
5610         mmu->invlpg(vcpu, gva, mmu->root_hpa);
5611
5612         /*
5613          * INVLPG is required to invalidate any global mappings for the VA,
5614          * irrespective of PCID. Since it would take us roughly similar amount
5615          * of work to determine whether any of the prev_root mappings of the VA
5616          * is marked global, or to just sync it blindly, so we might as well
5617          * just always sync it.
5618          *
5619          * Mappings not reachable via the current cr3 or the prev_roots will be
5620          * synced when switching to that cr3, so nothing needs to be done here
5621          * for them.
5622          */
5623         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5624                 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5625                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5626
5627         kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5628         ++vcpu->stat.invlpg;
5629 }
5630 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5631
5632 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5633 {
5634         struct kvm_mmu *mmu = vcpu->arch.mmu;
5635         bool tlb_flush = false;
5636         uint i;
5637
5638         if (pcid == kvm_get_active_pcid(vcpu)) {
5639                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5640                 tlb_flush = true;
5641         }
5642
5643         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5644                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5645                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5646                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5647                         tlb_flush = true;
5648                 }
5649         }
5650
5651         if (tlb_flush)
5652                 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5653
5654         ++vcpu->stat.invlpg;
5655
5656         /*
5657          * Mappings not reachable via the current cr3 or the prev_roots will be
5658          * synced when switching to that cr3, so nothing needs to be done here
5659          * for them.
5660          */
5661 }
5662 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5663
5664 void kvm_enable_tdp(void)
5665 {
5666         tdp_enabled = true;
5667 }
5668 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5669
5670 void kvm_disable_tdp(void)
5671 {
5672         tdp_enabled = false;
5673 }
5674 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5675
5676
5677 /* The return value indicates if tlb flush on all vcpus is needed. */
5678 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5679
5680 /* The caller should hold mmu-lock before calling this function. */
5681 static __always_inline bool
5682 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5683                         slot_level_handler fn, int start_level, int end_level,
5684                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5685 {
5686         struct slot_rmap_walk_iterator iterator;
5687         bool flush = false;
5688
5689         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5690                         end_gfn, &iterator) {
5691                 if (iterator.rmap)
5692                         flush |= fn(kvm, iterator.rmap);
5693
5694                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5695                         if (flush && lock_flush_tlb) {
5696                                 kvm_flush_remote_tlbs_with_address(kvm,
5697                                                 start_gfn,
5698                                                 iterator.gfn - start_gfn + 1);
5699                                 flush = false;
5700                         }
5701                         cond_resched_lock(&kvm->mmu_lock);
5702                 }
5703         }
5704
5705         if (flush && lock_flush_tlb) {
5706                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5707                                                    end_gfn - start_gfn + 1);
5708                 flush = false;
5709         }
5710
5711         return flush;
5712 }
5713
5714 static __always_inline bool
5715 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5716                   slot_level_handler fn, int start_level, int end_level,
5717                   bool lock_flush_tlb)
5718 {
5719         return slot_handle_level_range(kvm, memslot, fn, start_level,
5720                         end_level, memslot->base_gfn,
5721                         memslot->base_gfn + memslot->npages - 1,
5722                         lock_flush_tlb);
5723 }
5724
5725 static __always_inline bool
5726 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5727                       slot_level_handler fn, bool lock_flush_tlb)
5728 {
5729         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5730                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5731 }
5732
5733 static __always_inline bool
5734 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5735                         slot_level_handler fn, bool lock_flush_tlb)
5736 {
5737         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5738                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5739 }
5740
5741 static __always_inline bool
5742 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5743                  slot_level_handler fn, bool lock_flush_tlb)
5744 {
5745         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5746                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5747 }
5748
5749 static void free_mmu_pages(struct kvm_mmu *mmu)
5750 {
5751         free_page((unsigned long)mmu->pae_root);
5752         free_page((unsigned long)mmu->lm_root);
5753 }
5754
5755 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5756 {
5757         struct page *page;
5758         int i;
5759
5760         /*
5761          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5762          * while the PDP table is a per-vCPU construct that's allocated at MMU
5763          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5764          * x86_64.  Therefore we need to allocate the PDP table in the first
5765          * 4GB of memory, which happens to fit the DMA32 zone.  Except for
5766          * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
5767          * skip allocating the PDP table.
5768          */
5769         if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5770                 return 0;
5771
5772         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5773         if (!page)
5774                 return -ENOMEM;
5775
5776         mmu->pae_root = page_address(page);
5777         for (i = 0; i < 4; ++i)
5778                 mmu->pae_root[i] = INVALID_PAGE;
5779
5780         return 0;
5781 }
5782
5783 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5784 {
5785         uint i;
5786         int ret;
5787
5788         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5789         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5790
5791         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5792         vcpu->arch.root_mmu.root_cr3 = 0;
5793         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5794         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5795                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5796
5797         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5798         vcpu->arch.guest_mmu.root_cr3 = 0;
5799         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5800         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5801                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5802
5803         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5804
5805         ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5806         if (ret)
5807                 return ret;
5808
5809         ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5810         if (ret)
5811                 goto fail_allocate_root;
5812
5813         return ret;
5814  fail_allocate_root:
5815         free_mmu_pages(&vcpu->arch.guest_mmu);
5816         return ret;
5817 }
5818
5819 #define BATCH_ZAP_PAGES 10
5820 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5821 {
5822         struct kvm_mmu_page *sp, *node;
5823         int nr_zapped, batch = 0;
5824         bool unstable;
5825
5826 restart:
5827         list_for_each_entry_safe_reverse(sp, node,
5828               &kvm->arch.active_mmu_pages, link) {
5829                 /*
5830                  * No obsolete valid page exists before a newly created page
5831                  * since active_mmu_pages is a FIFO list.
5832                  */
5833                 if (!is_obsolete_sp(kvm, sp))
5834                         break;
5835
5836                 /*
5837                  * Skip invalid pages with a non-zero root count, zapping pages
5838                  * with a non-zero root count will never succeed, i.e. the page
5839                  * will get thrown back on active_mmu_pages and we'll get stuck
5840                  * in an infinite loop.
5841                  */
5842                 if (sp->role.invalid && sp->root_count)
5843                         continue;
5844
5845                 /*
5846                  * No need to flush the TLB since we're only zapping shadow
5847                  * pages with an obsolete generation number and all vCPUS have
5848                  * loaded a new root, i.e. the shadow pages being zapped cannot
5849                  * be in active use by the guest.
5850                  */
5851                 if (batch >= BATCH_ZAP_PAGES &&
5852                     cond_resched_lock(&kvm->mmu_lock)) {
5853                         batch = 0;
5854                         goto restart;
5855                 }
5856
5857                 unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
5858                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped);
5859                 batch += nr_zapped;
5860
5861                 if (unstable)
5862                         goto restart;
5863         }
5864
5865         /*
5866          * Trigger a remote TLB flush before freeing the page tables to ensure
5867          * KVM is not in the middle of a lockless shadow page table walk, which
5868          * may reference the pages.
5869          */
5870         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5871 }
5872
5873 /*
5874  * Fast invalidate all shadow pages and use lock-break technique
5875  * to zap obsolete pages.
5876  *
5877  * It's required when memslot is being deleted or VM is being
5878  * destroyed, in these cases, we should ensure that KVM MMU does
5879  * not use any resource of the being-deleted slot or all slots
5880  * after calling the function.
5881  */
5882 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5883 {
5884         lockdep_assert_held(&kvm->slots_lock);
5885
5886         spin_lock(&kvm->mmu_lock);
5887         trace_kvm_mmu_zap_all_fast(kvm);
5888
5889         /*
5890          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5891          * held for the entire duration of zapping obsolete pages, it's
5892          * impossible for there to be multiple invalid generations associated
5893          * with *valid* shadow pages at any given time, i.e. there is exactly
5894          * one valid generation and (at most) one invalid generation.
5895          */
5896         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5897
5898         /*
5899          * Notify all vcpus to reload its shadow page table and flush TLB.
5900          * Then all vcpus will switch to new shadow page table with the new
5901          * mmu_valid_gen.
5902          *
5903          * Note: we need to do this under the protection of mmu_lock,
5904          * otherwise, vcpu would purge shadow page but miss tlb flush.
5905          */
5906         kvm_reload_remote_mmus(kvm);
5907
5908         kvm_zap_obsolete_pages(kvm);
5909         spin_unlock(&kvm->mmu_lock);
5910 }
5911
5912 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5913 {
5914         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5915 }
5916
5917 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5918                         struct kvm_memory_slot *slot,
5919                         struct kvm_page_track_notifier_node *node)
5920 {
5921         kvm_mmu_zap_all_fast(kvm);
5922 }
5923
5924 void kvm_mmu_init_vm(struct kvm *kvm)
5925 {
5926         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5927
5928         node->track_write = kvm_mmu_pte_write;
5929         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5930         kvm_page_track_register_notifier(kvm, node);
5931 }
5932
5933 void kvm_mmu_uninit_vm(struct kvm *kvm)
5934 {
5935         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5936
5937         kvm_page_track_unregister_notifier(kvm, node);
5938 }
5939
5940 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5941 {
5942         struct kvm_memslots *slots;
5943         struct kvm_memory_slot *memslot;
5944         int i;
5945
5946         spin_lock(&kvm->mmu_lock);
5947         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5948                 slots = __kvm_memslots(kvm, i);
5949                 kvm_for_each_memslot(memslot, slots) {
5950                         gfn_t start, end;
5951
5952                         start = max(gfn_start, memslot->base_gfn);
5953                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5954                         if (start >= end)
5955                                 continue;
5956
5957                         slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5958                                                 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5959                                                 start, end - 1, true);
5960                 }
5961         }
5962
5963         spin_unlock(&kvm->mmu_lock);
5964 }
5965
5966 static bool slot_rmap_write_protect(struct kvm *kvm,
5967                                     struct kvm_rmap_head *rmap_head)
5968 {
5969         return __rmap_write_protect(kvm, rmap_head, false);
5970 }
5971
5972 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5973                                       struct kvm_memory_slot *memslot)
5974 {
5975         bool flush;
5976
5977         spin_lock(&kvm->mmu_lock);
5978         flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5979                                       false);
5980         spin_unlock(&kvm->mmu_lock);
5981
5982         /*
5983          * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
5984          * which do tlb flush out of mmu-lock should be serialized by
5985          * kvm->slots_lock otherwise tlb flush would be missed.
5986          */
5987         lockdep_assert_held(&kvm->slots_lock);
5988
5989         /*
5990          * We can flush all the TLBs out of the mmu lock without TLB
5991          * corruption since we just change the spte from writable to
5992          * readonly so that we only need to care the case of changing
5993          * spte from present to present (changing the spte from present
5994          * to nonpresent will flush all the TLBs immediately), in other
5995          * words, the only case we care is mmu_spte_update() where we
5996          * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5997          * instead of PT_WRITABLE_MASK, that means it does not depend
5998          * on PT_WRITABLE_MASK anymore.
5999          */
6000         if (flush)
6001                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6002                         memslot->npages);
6003 }
6004
6005 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6006                                          struct kvm_rmap_head *rmap_head)
6007 {
6008         u64 *sptep;
6009         struct rmap_iterator iter;
6010         int need_tlb_flush = 0;
6011         kvm_pfn_t pfn;
6012         struct kvm_mmu_page *sp;
6013
6014 restart:
6015         for_each_rmap_spte(rmap_head, &iter, sptep) {
6016                 sp = page_header(__pa(sptep));
6017                 pfn = spte_to_pfn(*sptep);
6018
6019                 /*
6020                  * We cannot do huge page mapping for indirect shadow pages,
6021                  * which are found on the last rmap (level = 1) when not using
6022                  * tdp; such shadow pages are synced with the page table in
6023                  * the guest, and the guest page table is using 4K page size
6024                  * mapping if the indirect sp has level = 1.
6025                  */
6026                 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6027                     !kvm_is_zone_device_pfn(pfn) &&
6028                     PageTransCompoundMap(pfn_to_page(pfn))) {
6029                         pte_list_remove(rmap_head, sptep);
6030
6031                         if (kvm_available_flush_tlb_with_range())
6032                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6033                                         KVM_PAGES_PER_HPAGE(sp->role.level));
6034                         else
6035                                 need_tlb_flush = 1;
6036
6037                         goto restart;
6038                 }
6039         }
6040
6041         return need_tlb_flush;
6042 }
6043
6044 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6045                                    const struct kvm_memory_slot *memslot)
6046 {
6047         /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
6048         spin_lock(&kvm->mmu_lock);
6049         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
6050                          kvm_mmu_zap_collapsible_spte, true);
6051         spin_unlock(&kvm->mmu_lock);
6052 }
6053
6054 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6055                                    struct kvm_memory_slot *memslot)
6056 {
6057         bool flush;
6058
6059         spin_lock(&kvm->mmu_lock);
6060         flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
6061         spin_unlock(&kvm->mmu_lock);
6062
6063         lockdep_assert_held(&kvm->slots_lock);
6064
6065         /*
6066          * It's also safe to flush TLBs out of mmu lock here as currently this
6067          * function is only used for dirty logging, in which case flushing TLB
6068          * out of mmu lock also guarantees no dirty pages will be lost in
6069          * dirty_bitmap.
6070          */
6071         if (flush)
6072                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6073                                 memslot->npages);
6074 }
6075 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
6076
6077 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
6078                                         struct kvm_memory_slot *memslot)
6079 {
6080         bool flush;
6081
6082         spin_lock(&kvm->mmu_lock);
6083         flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
6084                                         false);
6085         spin_unlock(&kvm->mmu_lock);
6086
6087         /* see kvm_mmu_slot_remove_write_access */
6088         lockdep_assert_held(&kvm->slots_lock);
6089
6090         if (flush)
6091                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6092                                 memslot->npages);
6093 }
6094 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
6095
6096 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
6097                             struct kvm_memory_slot *memslot)
6098 {
6099         bool flush;
6100
6101         spin_lock(&kvm->mmu_lock);
6102         flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
6103         spin_unlock(&kvm->mmu_lock);
6104
6105         lockdep_assert_held(&kvm->slots_lock);
6106
6107         /* see kvm_mmu_slot_leaf_clear_dirty */
6108         if (flush)
6109                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6110                                 memslot->npages);
6111 }
6112 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
6113
6114 void kvm_mmu_zap_all(struct kvm *kvm)
6115 {
6116         struct kvm_mmu_page *sp, *node;
6117         LIST_HEAD(invalid_list);
6118         int ign;
6119
6120         spin_lock(&kvm->mmu_lock);
6121 restart:
6122         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6123                 if (sp->role.invalid && sp->root_count)
6124                         continue;
6125                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6126                         goto restart;
6127                 if (cond_resched_lock(&kvm->mmu_lock))
6128                         goto restart;
6129         }
6130
6131         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6132         spin_unlock(&kvm->mmu_lock);
6133 }
6134
6135 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6136 {
6137         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6138
6139         gen &= MMIO_SPTE_GEN_MASK;
6140
6141         /*
6142          * Generation numbers are incremented in multiples of the number of
6143          * address spaces in order to provide unique generations across all
6144          * address spaces.  Strip what is effectively the address space
6145          * modifier prior to checking for a wrap of the MMIO generation so
6146          * that a wrap in any address space is detected.
6147          */
6148         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6149
6150         /*
6151          * The very rare case: if the MMIO generation number has wrapped,
6152          * zap all shadow pages.
6153          */
6154         if (unlikely(gen == 0)) {
6155                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6156                 kvm_mmu_zap_all_fast(kvm);
6157         }
6158 }
6159
6160 static unsigned long
6161 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6162 {
6163         struct kvm *kvm;
6164         int nr_to_scan = sc->nr_to_scan;
6165         unsigned long freed = 0;
6166
6167         mutex_lock(&kvm_lock);
6168
6169         list_for_each_entry(kvm, &vm_list, vm_list) {
6170                 int idx;
6171                 LIST_HEAD(invalid_list);
6172
6173                 /*
6174                  * Never scan more than sc->nr_to_scan VM instances.
6175                  * Will not hit this condition practically since we do not try
6176                  * to shrink more than one VM and it is very unlikely to see
6177                  * !n_used_mmu_pages so many times.
6178                  */
6179                 if (!nr_to_scan--)
6180                         break;
6181                 /*
6182                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6183                  * here. We may skip a VM instance errorneosly, but we do not
6184                  * want to shrink a VM that only started to populate its MMU
6185                  * anyway.
6186                  */
6187                 if (!kvm->arch.n_used_mmu_pages &&
6188                     !kvm_has_zapped_obsolete_pages(kvm))
6189                         continue;
6190
6191                 idx = srcu_read_lock(&kvm->srcu);
6192                 spin_lock(&kvm->mmu_lock);
6193
6194                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6195                         kvm_mmu_commit_zap_page(kvm,
6196                               &kvm->arch.zapped_obsolete_pages);
6197                         goto unlock;
6198                 }
6199
6200                 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
6201                         freed++;
6202                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6203
6204 unlock:
6205                 spin_unlock(&kvm->mmu_lock);
6206                 srcu_read_unlock(&kvm->srcu, idx);
6207
6208                 /*
6209                  * unfair on small ones
6210                  * per-vm shrinkers cry out
6211                  * sadness comes quickly
6212                  */
6213                 list_move_tail(&kvm->vm_list, &vm_list);
6214                 break;
6215         }
6216
6217         mutex_unlock(&kvm_lock);
6218         return freed;
6219 }
6220
6221 static unsigned long
6222 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6223 {
6224         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6225 }
6226
6227 static struct shrinker mmu_shrinker = {
6228         .count_objects = mmu_shrink_count,
6229         .scan_objects = mmu_shrink_scan,
6230         .seeks = DEFAULT_SEEKS * 10,
6231 };
6232
6233 static void mmu_destroy_caches(void)
6234 {
6235         kmem_cache_destroy(pte_list_desc_cache);
6236         kmem_cache_destroy(mmu_page_header_cache);
6237 }
6238
6239 static void kvm_set_mmio_spte_mask(void)
6240 {
6241         u64 mask;
6242
6243         /*
6244          * Set a reserved PA bit in MMIO SPTEs to generate page faults with
6245          * PFEC.RSVD=1 on MMIO accesses.  64-bit PTEs (PAE, x86-64, and EPT
6246          * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
6247          * 52-bit physical addresses then there are no reserved PA bits in the
6248          * PTEs and so the reserved PA approach must be disabled.
6249          */
6250         if (shadow_phys_bits < 52)
6251                 mask = BIT_ULL(51) | PT_PRESENT_MASK;
6252         else
6253                 mask = 0;
6254
6255         kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6256 }
6257
6258 static bool get_nx_auto_mode(void)
6259 {
6260         /* Return true when CPU has the bug, and mitigations are ON */
6261         return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6262 }
6263
6264 static void __set_nx_huge_pages(bool val)
6265 {
6266         nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6267 }
6268
6269 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6270 {
6271         bool old_val = nx_huge_pages;
6272         bool new_val;
6273
6274         /* In "auto" mode deploy workaround only if CPU has the bug. */
6275         if (sysfs_streq(val, "off"))
6276                 new_val = 0;
6277         else if (sysfs_streq(val, "force"))
6278                 new_val = 1;
6279         else if (sysfs_streq(val, "auto"))
6280                 new_val = get_nx_auto_mode();
6281         else if (strtobool(val, &new_val) < 0)
6282                 return -EINVAL;
6283
6284         __set_nx_huge_pages(new_val);
6285
6286         if (new_val != old_val) {
6287                 struct kvm *kvm;
6288
6289                 mutex_lock(&kvm_lock);
6290
6291                 list_for_each_entry(kvm, &vm_list, vm_list) {
6292                         mutex_lock(&kvm->slots_lock);
6293                         kvm_mmu_zap_all_fast(kvm);
6294                         mutex_unlock(&kvm->slots_lock);
6295
6296                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6297                 }
6298                 mutex_unlock(&kvm_lock);
6299         }
6300
6301         return 0;
6302 }
6303
6304 int kvm_mmu_module_init(void)
6305 {
6306         int ret = -ENOMEM;
6307
6308         if (nx_huge_pages == -1)
6309                 __set_nx_huge_pages(get_nx_auto_mode());
6310
6311         /*
6312          * MMU roles use union aliasing which is, generally speaking, an
6313          * undefined behavior. However, we supposedly know how compilers behave
6314          * and the current status quo is unlikely to change. Guardians below are
6315          * supposed to let us know if the assumption becomes false.
6316          */
6317         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6318         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6319         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6320
6321         kvm_mmu_reset_all_pte_masks();
6322
6323         kvm_set_mmio_spte_mask();
6324
6325         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6326                                             sizeof(struct pte_list_desc),
6327                                             0, SLAB_ACCOUNT, NULL);
6328         if (!pte_list_desc_cache)
6329                 goto out;
6330
6331         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6332                                                   sizeof(struct kvm_mmu_page),
6333                                                   0, SLAB_ACCOUNT, NULL);
6334         if (!mmu_page_header_cache)
6335                 goto out;
6336
6337         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6338                 goto out;
6339
6340         ret = register_shrinker(&mmu_shrinker);
6341         if (ret)
6342                 goto out;
6343
6344         return 0;
6345
6346 out:
6347         mmu_destroy_caches();
6348         return ret;
6349 }
6350
6351 /*
6352  * Calculate mmu pages needed for kvm.
6353  */
6354 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6355 {
6356         unsigned long nr_mmu_pages;
6357         unsigned long nr_pages = 0;
6358         struct kvm_memslots *slots;
6359         struct kvm_memory_slot *memslot;
6360         int i;
6361
6362         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6363                 slots = __kvm_memslots(kvm, i);
6364
6365                 kvm_for_each_memslot(memslot, slots)
6366                         nr_pages += memslot->npages;
6367         }
6368
6369         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6370         nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6371
6372         return nr_mmu_pages;
6373 }
6374
6375 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6376 {
6377         kvm_mmu_unload(vcpu);
6378         free_mmu_pages(&vcpu->arch.root_mmu);
6379         free_mmu_pages(&vcpu->arch.guest_mmu);
6380         mmu_free_memory_caches(vcpu);
6381 }
6382
6383 void kvm_mmu_module_exit(void)
6384 {
6385         mmu_destroy_caches();
6386         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6387         unregister_shrinker(&mmu_shrinker);
6388         mmu_audit_disable();
6389 }
6390
6391 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6392 {
6393         unsigned int old_val;
6394         int err;
6395
6396         old_val = nx_huge_pages_recovery_ratio;
6397         err = param_set_uint(val, kp);
6398         if (err)
6399                 return err;
6400
6401         if (READ_ONCE(nx_huge_pages) &&
6402             !old_val && nx_huge_pages_recovery_ratio) {
6403                 struct kvm *kvm;
6404
6405                 mutex_lock(&kvm_lock);
6406
6407                 list_for_each_entry(kvm, &vm_list, vm_list)
6408                         wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6409
6410                 mutex_unlock(&kvm_lock);
6411         }
6412
6413         return err;
6414 }
6415
6416 static void kvm_recover_nx_lpages(struct kvm *kvm)
6417 {
6418         int rcu_idx;
6419         struct kvm_mmu_page *sp;
6420         unsigned int ratio;
6421         LIST_HEAD(invalid_list);
6422         ulong to_zap;
6423
6424         rcu_idx = srcu_read_lock(&kvm->srcu);
6425         spin_lock(&kvm->mmu_lock);
6426
6427         ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6428         to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6429         while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
6430                 /*
6431                  * We use a separate list instead of just using active_mmu_pages
6432                  * because the number of lpage_disallowed pages is expected to
6433                  * be relatively small compared to the total.
6434                  */
6435                 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6436                                       struct kvm_mmu_page,
6437                                       lpage_disallowed_link);
6438                 WARN_ON_ONCE(!sp->lpage_disallowed);
6439                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6440                 WARN_ON_ONCE(sp->lpage_disallowed);
6441
6442                 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6443                         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6444                         if (to_zap)
6445                                 cond_resched_lock(&kvm->mmu_lock);
6446                 }
6447         }
6448         kvm_mmu_commit_zap_page(kvm, &invalid_list);
6449
6450         spin_unlock(&kvm->mmu_lock);
6451         srcu_read_unlock(&kvm->srcu, rcu_idx);
6452 }
6453
6454 static long get_nx_lpage_recovery_timeout(u64 start_time)
6455 {
6456         return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6457                 ? start_time + 60 * HZ - get_jiffies_64()
6458                 : MAX_SCHEDULE_TIMEOUT;
6459 }
6460
6461 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6462 {
6463         u64 start_time;
6464         long remaining_time;
6465
6466         while (true) {
6467                 start_time = get_jiffies_64();
6468                 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6469
6470                 set_current_state(TASK_INTERRUPTIBLE);
6471                 while (!kthread_should_stop() && remaining_time > 0) {
6472                         schedule_timeout(remaining_time);
6473                         remaining_time = get_nx_lpage_recovery_timeout(start_time);
6474                         set_current_state(TASK_INTERRUPTIBLE);
6475                 }
6476
6477                 set_current_state(TASK_RUNNING);
6478
6479                 if (kthread_should_stop())
6480                         return 0;
6481
6482                 kvm_recover_nx_lpages(kvm);
6483         }
6484 }
6485
6486 int kvm_mmu_post_init_vm(struct kvm *kvm)
6487 {
6488         int err;
6489
6490         err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6491                                           "kvm-nx-lpage-recovery",
6492                                           &kvm->arch.nx_lpage_recovery_thread);
6493         if (!err)
6494                 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6495
6496         return err;
6497 }
6498
6499 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6500 {
6501         if (kvm->arch.nx_lpage_recovery_thread)
6502                 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6503 }