1 // SPDX-License-Identifier: GPL-2.0
4 #include "mmu_internal.h"
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
/*
 * Module-wide switch for the TDP MMU.  It defaults to false and is exposed as
 * the bool module parameter "tdp_mmu" with permissions 0644.
 */
13 static bool __read_mostly tdp_mmu_enabled = false;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
16 /* Initializes the TDP MMU for the VM, if enabled. */
/*
 * Per-VM setup.  Tests tdp_enabled and the tdp_mmu module parameter (sampled
 * once with READ_ONCE), then marks the VM as using the TDP MMU and initializes
 * the root list, the page list and tdp_mmu_pages_lock.
 * NOTE(review): the return statements are not visible in this listing;
 * presumably false when either switch is off and true otherwise - confirm.
 */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
19 if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 /* This should not be changed for the lifetime of the VM. */
23 kvm->arch.tdp_mmu_enabled = true;
25 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
/*
 * Lockdep helper asserting that kvm->mmu_lock is held, in read mode or in
 * write mode.
 * NOTE(review): the second parameter and the branch choosing between the two
 * assertions are not visible in this listing.  Callers pass a "shared" flag,
 * so presumably shared selects the read-mode assertion - confirm.
 */
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
36 lockdep_assert_held_read(&kvm->mmu_lock);
38 lockdep_assert_held_write(&kvm->mmu_lock);
/*
 * Per-VM teardown.  Tests kvm->arch.tdp_mmu_enabled first (the statement taken
 * when it is clear is not visible in this listing), then warns if any TDP MMU
 * pages or roots are still on their lists.
 */
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
43 if (!kvm->arch.tdp_mmu_enabled)
46 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
50 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 * can run before the VM is torn down.
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 gfn_t start, gfn_t end, bool can_yield, bool flush,
/*
 * Release the memory of a TDP MMU shadow page: first the page-table page
 * referenced by sp->spt, then the kvm_mmu_page header itself, which is
 * returned to mmu_page_header_cache.
 */
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
62 free_page((unsigned long)sp->spt);
63 kmem_cache_free(mmu_page_header_cache, sp);
67 * This is called through call_rcu in order to free TDP page table memory
68 * safely with respect to other kernel threads that may be operating on
70 * By only accessing TDP MMU page table memory in an RCU read critical
71 * section, and freeing it after a grace period, lockless access to that
72 * memory won't use it after it is freed.
/*
 * RCU callback: recovers the kvm_mmu_page that embeds @head via
 * container_of().
 * NOTE(review): the remainder of the body is not visible in this listing;
 * presumably the page is then handed to tdp_mmu_free_sp() - confirm.
 */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
76 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
/*
 * Drop one reference to @root; mmu_lock must be held in the mode described by
 * the shared flag.  The teardown below follows the refcount_dec_and_test()
 * check (the statement taken when other references remain is not visible in
 * this listing; presumably an early return).  Teardown warns if the root is
 * not a TDP MMU page, unlinks it from its RCU list under tdp_mmu_pages_lock,
 * zaps the GFN range from 0 to -1ull without yielding, and queues the page
 * for freeing through call_rcu().
 */
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
85 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
87 if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
90 WARN_ON(!root->tdp_mmu_page);
92 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 list_del_rcu(&root->link);
94 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
96 zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
98 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
102 * Returns the next root after @prev_root (or the first root if @prev_root is
103 * NULL). A reference to the returned root is acquired, and the reference to
104 * @prev_root is released (the caller obviously must hold a reference to
105 * @prev_root if it's non-NULL).
107 * If @only_valid is true, invalid roots are skipped.
109 * Returns NULL if the end of tdp_mmu_roots was reached.
/*
 * Advance the root iteration.  The starting candidate is either the entry
 * after @prev_root or the first entry of tdp_mmu_roots.  Candidates are then
 * stepped through until one passes the @only_valid filter and a reference to
 * it can be taken with kvm_tdp_mmu_get_root().  The reference on @prev_root is
 * dropped with kvm_tdp_mmu_put_root() using the @shared mode of the caller.
 */
111 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
112 struct kvm_mmu_page *prev_root,
113 bool shared, bool only_valid)
115 struct kvm_mmu_page *next_root;
120 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 typeof(*prev_root), link);
124 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
125 typeof(*next_root), link);
128 if ((!only_valid || !next_root->role.invalid) &&
129 kvm_tdp_mmu_get_root(kvm, next_root))
132 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
133 &next_root->link, typeof(*next_root), link);
139 kvm_tdp_mmu_put_root(kvm, prev_root, shared);
145 * Note: this iterator gets and puts references to the roots it iterates over.
146 * This makes it safe to release the MMU lock and yield within the loop, but
147 * if exiting the loop early, the caller must drop the reference to the most
148 * recent root. (Unless keeping a live reference is desirable.)
150 * If shared is set, this function is operating under the MMU lock in read
151 * mode. In the unlikely event that this thread must free a root, the lock
152 * will be temporarily dropped and reacquired in write mode.
/*
 * Root iterators built on tdp_mmu_next_root().  The double-underscore form
 * takes an explicit _only_valid flag and filters on the address space id.
 * The two wrappers that follow fix that flag to true (valid roots only) and
 * to false (all roots) respectively.
 */
154 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
155 for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid); \
157 _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid)) \
158 if (kvm_mmu_page_as_id(_root) != _as_id) { \
161 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
162 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
164 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared) \
165 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, false)
/*
 * Iterate over the TDP MMU roots of @_kvm that belong to address space
 * @_as_id; roots of any other address space fall into the empty if-branch and
 * are skipped, so the statement following the macro only runs for matches.
 *
 * The walk uses list_for_each_entry_rcu() and passes lockdep a condition that
 * accepts either mmu_lock (queried with lockdep_is_held_type(..., 0)) or
 * tdp_mmu_pages_lock being held.
 *
 * The lockdep condition refers to the macro argument @_kvm.  It previously
 * named a bare "kvm", which silently captured whatever variable of that name
 * was in scope at the expansion site.  The @_kvm and @_as_id arguments are
 * parenthesized so that expression arguments expand correctly.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&(_kvm)->mmu_lock, 0) ||	\
				lockdep_is_held(&(_kvm)->arch.tdp_mmu_pages_lock)) \
		if (kvm_mmu_page_as_id(_root) != (_as_id)) {			\
		} else
/*
 * Build the page role for a TDP MMU page: start from the base role of the
 * vCPU MMU and force gpte_is_8_bytes and full access (ACC_ALL).
 * NOTE(review): the remaining parameter(s), any further role fields set here
 * and the return statement are not visible in this listing.
 */
174 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
177 union kvm_mmu_page_role role;
179 role = vcpu->arch.mmu->mmu_role.base;
182 role.gpte_is_8_bytes = true;
183 role.access = ACC_ALL;
/*
 * Allocate a TDP MMU shadow page from the per-vCPU memory caches: the header
 * comes from mmu_page_header_cache and the page table from
 * mmu_shadow_page_cache.  The struct page of the page table gets a
 * back-pointer to the header via set_page_private(), the role comes from
 * page_role_for_level(), and the page is flagged as a TDP MMU page and traced.
 */
188 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
191 struct kvm_mmu_page *sp;
193 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
194 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
195 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
197 sp->role.word = page_role_for_level(vcpu, level).word;
199 sp->tdp_mmu_page = true;
201 trace_kvm_mmu_get_page(sp, true);
/*
 * Return the physical address of the root page table for @vcpu.  Requires
 * mmu_lock held for write.  Existing roots of the matching address space are
 * searched for one with an identical role to which a reference can be taken.
 * The visible allocation path creates a new root with a refcount of 1 and
 * publishes it on tdp_mmu_roots under tdp_mmu_pages_lock.
 */
206 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
208 union kvm_mmu_page_role role;
209 struct kvm *kvm = vcpu->kvm;
210 struct kvm_mmu_page *root;
212 lockdep_assert_held_write(&kvm->mmu_lock);
214 role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
217 * Check for an existing root before allocating a new one. Note, the
218 * role check prevents consuming an invalid root.
220 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
221 if (root->role.word == role.word &&
222 kvm_tdp_mmu_get_root(kvm, root))
226 root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
227 refcount_set(&root->tdp_mmu_root_count, 1);
229 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
230 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
231 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
234 return __pa(root->spt);
237 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
238 u64 old_spte, u64 new_spte, int level,
/*
 * Propagate accessed state for an SPTE change.  The first check singles out
 * old SPTEs that were not present leaves (the statement taken for them is not
 * visible in this listing).  The old PFN is reported through
 * kvm_set_pfn_accessed() when the old SPTE was accessed and the new SPTE is
 * non-present, is not accessed, or maps a different PFN.
 */
241 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
243 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
246 if (is_accessed_spte(old_spte) &&
247 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
248 spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
249 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
/*
 * Dirty-log bookkeeping for an SPTE change.  Levels above PG_LEVEL_4K are
 * tested first (the statement taken for them is not visible in this listing).
 * The GFN is marked dirty in its memslot when the new SPTE is writable and
 * either the old SPTE was not writable or the PFN changed.
 */
252 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
253 u64 old_spte, u64 new_spte, int level)
256 struct kvm_memory_slot *slot;
258 if (level > PG_LEVEL_4K)
261 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
263 if ((!is_writable_pte(old_spte) || pfn_changed) &&
264 is_writable_pte(new_spte)) {
265 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
266 mark_page_dirty_in_slot(kvm, slot, gfn);
271 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
275 * @account_nx: This page replaces a NX large page and should be marked for
/*
 * Under tdp_mmu_pages_lock, add @sp to the tdp_mmu_pages list of the VM and
 * account it through account_huge_nx_page().
 * NOTE(review): the condition guarding the accounting call (presumably the
 * account_nx argument) is not visible in this listing - confirm.
 */
278 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
281 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
282 list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
284 account_huge_nx_page(kvm, sp);
285 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
289 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
292 * @sp: the page to be removed
293 * @shared: This operation may not be running under the exclusive use of
294 * the MMU lock and the operation must synchronize with other
295 * threads that might be adding or removing pages.
/*
 * Undo the NX huge page accounting for @sp if it is marked lpage_disallowed.
 * The visible body both takes tdp_mmu_pages_lock and asserts that mmu_lock is
 * held for write.
 * NOTE(review): those two statements are presumably the arms of an if/else on
 * the shared flag, and the list removal itself is not visible in this
 * listing - confirm.
 */
297 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
301 spin_lock(&kvm->arch.tdp_mmu_pages_lock);
303 lockdep_assert_held_write(&kvm->mmu_lock);
306 if (sp->lpage_disallowed)
307 unaccount_huge_nx_page(kvm, sp);
310 spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
314 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
317 * @pt: the page removed from the paging structure
318 * @shared: This operation may not be running under the exclusive use
319 * of the MMU lock and the operation must synchronize with other
320 * threads that might be modifying SPTEs.
322 * Given a page table that has been removed from the TDP paging structure,
323 * iterates through the page table to clear SPTEs and free child page tables.
325 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
326 * protection. Since this thread removed it from the paging structure,
327 * this thread will be responsible for ensuring the page is freed. Hence the
328 * early rcu_dereferences in the function.
/*
 * Tear down a page table that has already been disconnected from the paging
 * structure.  The owning shadow page is traced and unlinked, then each of the
 * PT64_ENT_PER_PAGE entries is replaced with REMOVED_SPTE (atomically with
 * xchg() in one branch, with WRITE_ONCE() in the other) and the change is
 * reported through handle_changed_spte().  Finally the GFN range covered by
 * the page is flushed from remote TLBs and the page is queued for freeing
 * through call_rcu().
 */
330 static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
333 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
334 int level = sp->role.level;
335 gfn_t base_gfn = sp->gfn;
338 trace_kvm_mmu_prepare_zap_page(sp);
340 tdp_mmu_unlink_page(kvm, sp, shared);
342 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
343 u64 *sptep = rcu_dereference(pt) + i;
344 gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
349 * Set the SPTE to a nonpresent value that other
350 * threads will not overwrite. If the SPTE was
351 * already marked as removed then another thread
352 * handling a page fault could overwrite it, so
353 * set the SPTE until it is set from some other
354 * value to the removed SPTE value.
357 old_child_spte = xchg(sptep, REMOVED_SPTE);
358 if (!is_removed_spte(old_child_spte))
364 * If the SPTE is not MMU-present, there is no backing
365 * page associated with the SPTE and so no side effects
366 * that need to be recorded, and exclusive ownership of
367 * mmu_lock ensures the SPTE can't be made present.
368 * Note, zapping MMIO SPTEs is also unnecessary as they
369 * are guarded by the memslots generation, not by being
372 old_child_spte = READ_ONCE(*sptep);
373 if (!is_shadow_present_pte(old_child_spte))
377 * Marking the SPTE as a removed SPTE is not
378 * strictly necessary here as the MMU lock will
379 * stop other threads from concurrently modifying
380 * this SPTE. Using the removed SPTE value keeps
381 * the two branches consistent and simplifies
384 WRITE_ONCE(*sptep, REMOVED_SPTE);
386 handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
387 old_child_spte, REMOVED_SPTE, level,
391 kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
392 KVM_PAGES_PER_HPAGE(level + 1));
394 call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
398 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
400 * @as_id: the address space of the paging structure the SPTE was a part of
401 * @gfn: the base GFN that was mapped by the SPTE
402 * @old_spte: The value of the SPTE before the change
403 * @new_spte: The value of the SPTE after the change
404 * @level: the level of the PT the SPTE is part of in the paging structure
405 * @shared: This operation may not be running under the exclusive use of
406 * the MMU lock and the operation must synchronize with other
407 * threads that might be modifying SPTEs.
409 * Handle bookkeeping that might result from the modification of a SPTE.
410 * This function must be called for all TDP SPTE modifications.
/*
 * Core bookkeeping for a single SPTE transition.  In order, the visible code:
 * sanity-checks @level and the alignment of @gfn; logs an error when a present
 * leaf is replaced by a present leaf with a different PFN; tests for an
 * unchanged SPTE; emits the tracepoint; treats non-present to non-present
 * transitions specially; updates the page statistics when leaf-ness changes;
 * reports the old PFN as dirty when dirty state would otherwise be lost; and
 * hands a removed child page table to handle_removed_tdp_mmu_page().
 * NOTE(review): the early-exit statements after several of these checks are
 * not visible in this listing.
 */
412 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
413 u64 old_spte, u64 new_spte, int level,
416 bool was_present = is_shadow_present_pte(old_spte);
417 bool is_present = is_shadow_present_pte(new_spte);
418 bool was_leaf = was_present && is_last_spte(old_spte, level);
419 bool is_leaf = is_present && is_last_spte(new_spte, level);
420 bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
422 WARN_ON(level > PT64_ROOT_MAX_LEVEL);
423 WARN_ON(level < PG_LEVEL_4K);
424 WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
427 * If this warning were to trigger it would indicate that there was a
428 * missing MMU notifier or a race with some notifier handler.
429 * A present, leaf SPTE should never be directly replaced with another
430 * present leaf SPTE pointing to a different PFN. A notifier handler
431 * should be zapping the SPTE before the main MM's page table is
432 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
433 * thread before replacement.
435 if (was_leaf && is_leaf && pfn_changed) {
436 pr_err("Invalid SPTE change: cannot replace a present leaf\n"
437 "SPTE with another present leaf SPTE mapping a\n"
439 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
440 as_id, gfn, old_spte, new_spte, level);
443 * Crash the host to prevent error propagation and guest data
449 if (old_spte == new_spte)
452 trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
455 * The only times a SPTE should be changed from a non-present to
456 * non-present state is when an MMIO entry is installed/modified/
457 * removed. In that case, there is nothing to do here.
459 if (!was_present && !is_present) {
461 * If this change does not involve a MMIO SPTE or removed SPTE,
462 * it is unexpected. Log the change, though it should not
463 * impact the guest since both the former and current SPTEs
466 if (WARN_ON(!is_mmio_spte(old_spte) &&
467 !is_mmio_spte(new_spte) &&
468 !is_removed_spte(new_spte)))
469 pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
470 "should not be replaced with another,\n"
471 "different nonpresent SPTE, unless one or both\n"
472 "are MMIO SPTEs, or the new SPTE is\n"
473 "a temporary removed SPTE.\n"
474 "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
475 as_id, gfn, old_spte, new_spte, level);
479 if (is_leaf != was_leaf)
480 kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
482 if (was_leaf && is_dirty_spte(old_spte) &&
483 (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
484 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
487 * Recursively handle child PTs if the change removed a subtree from
488 * the paging structure.
490 if (was_present && !was_leaf && (pfn_changed || !is_present))
491 handle_removed_tdp_mmu_page(kvm,
492 spte_to_child_pt(old_spte, level), shared);
/*
 * Full-bookkeeping wrapper: runs __handle_changed_spte() and then the
 * accessed-tracking and dirty-logging helpers for the same transition.
 */
495 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
496 u64 old_spte, u64 new_spte, int level,
499 __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
501 handle_changed_spte_acc_track(old_spte, new_spte, level);
502 handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
507 * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
508 * and handle the associated bookkeeping, but do not mark the page dirty
509 * in KVM's dirty bitmaps.
512 * @iter: a tdp_iter instance currently on the SPTE that should be set
513 * @new_spte: The value the SPTE should be set to
514 * Returns: true if the SPTE was set, false if it was not. If false is returned,
515 * this function will have no side-effects.
/*
 * Atomic setter for use with mmu_lock held for read.  Warns once if the
 * iterator has yielded, tests whether the old SPTE is already a removed SPTE,
 * and then swaps in @new_spte with cmpxchg64() against iter->old_spte, so a
 * concurrent modification makes the exchange fail.  After a successful
 * exchange the generic and accessed-tracking bookkeeping runs; no dirty-log
 * helper is called here.
 */
517 static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
518 struct tdp_iter *iter,
521 WARN_ON_ONCE(iter->yielded);
523 lockdep_assert_held_read(&kvm->mmu_lock);
526 * Do not change removed SPTEs. Only the thread that froze the SPTE
529 if (is_removed_spte(iter->old_spte))
533 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
534 * does not hold the mmu_lock.
536 if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
537 new_spte) != iter->old_spte)
540 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
541 new_spte, iter->level, true);
542 handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
548 * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
551 * @vcpu: The vcpu instance that took the TDP page fault.
552 * @iter: a tdp_iter instance currently on the SPTE that should be set
553 * @new_spte: The value the SPTE should be set to
555 * Returns: true if the SPTE was set, false if it was not. If false is returned,
556 * this function will have no side-effects.
/*
 * Page-fault variant of the atomic setter.  The SPTE is installed through
 * tdp_mmu_set_spte_atomic_no_dirty_log().  For a writable new SPTE whose
 * memslot has dirty tracking enabled, the GFN is then marked dirty in that
 * slot; a level above PG_LEVEL_4K triggers a one-time warning in that case.
 */
558 static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
559 struct tdp_iter *iter,
562 struct kvm *kvm = vcpu->kvm;
564 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
568 * Use kvm_vcpu_gfn_to_memslot() instead of going through
569 * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
571 if (is_writable_pte(new_spte)) {
572 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
574 if (slot && kvm_slot_dirty_track_enabled(slot)) {
575 /* Enforced by kvm_mmu_hugepage_adjust. */
576 WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
577 mark_page_dirty_in_slot(kvm, slot, iter->gfn);
/*
 * Atomically zap the SPTE the iterator points at, in three steps: freeze it
 * by installing REMOVED_SPTE through the atomic setter, flush the GFN range
 * covered by the SPTE from remote TLBs, then overwrite the frozen entry
 * with 0.
 */
584 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
585 struct tdp_iter *iter)
588 * Freeze the SPTE by setting it to a special,
589 * non-present value. This will stop other threads from
590 * immediately installing a present entry in its place
591 * before the TLBs are flushed.
593 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
596 kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
597 KVM_PAGES_PER_HPAGE(iter->level));
600 * No other thread can overwrite the removed SPTE as they
601 * must either wait on the MMU lock or use
602 * tdp_mmu_set_spte_atomic which will not overwrite the
603 * special removed SPTE value. No bookkeeping is needed
604 * here since the SPTE is going from non-present
607 WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
614 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
616 * @iter: a tdp_iter instance currently on the SPTE that should be set
617 * @new_spte: The value the SPTE should be set to
618 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
619 * of the page. Should be set unless handling an MMU
620 * notifier for access tracking. Leaving record_acc_track
621 * unset in that case prevents page accesses from being
623 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
624 * appropriate for the change being made. Should be set
625 * unless performing certain dirty logging operations.
626 * Leaving record_dirty_log unset in that case prevents page
627 * writes from being double counted.
/*
 * Non-atomic setter for use with mmu_lock held for write.  Warns if the
 * iterator has yielded or if the old SPTE is a removed SPTE, stores @new_spte
 * with WRITE_ONCE(), then runs __handle_changed_spte() followed by the
 * accessed-tracking and dirty-logging helpers, each only when its
 * corresponding flag is set.
 */
629 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
630 u64 new_spte, bool record_acc_track,
631 bool record_dirty_log)
633 WARN_ON_ONCE(iter->yielded);
635 lockdep_assert_held_write(&kvm->mmu_lock);
638 * No thread should be using this function to set SPTEs to the
639 * temporary removed SPTE value.
640 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
641 * should be used. If operating under the MMU lock in write mode, the
642 * use of the removed SPTE should not be necessary.
644 WARN_ON(is_removed_spte(iter->old_spte));
646 WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
648 __handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
649 new_spte, iter->level, false);
650 if (record_acc_track)
651 handle_changed_spte_acc_track(iter->old_spte, new_spte,
653 if (record_dirty_log)
654 handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
655 iter->old_spte, new_spte,
/* Set an SPTE with both accessed tracking and dirty logging recorded. */
659 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
662 __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
/* Set an SPTE, recording dirty logging but skipping accessed tracking. */
665 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
666 struct tdp_iter *iter,
669 __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
/* Set an SPTE, recording accessed tracking but skipping dirty logging. */
672 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
673 struct tdp_iter *iter,
676 __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
/*
 * SPTE walkers layered on for_each_tdp_pte():
 *  - tdp_root_for_each_pte walks from a root shadow page, starting at the
 *    level recorded in the role of that root;
 *  - tdp_root_for_each_leaf_pte adds a filter on SPTEs that are not present
 *    or not last-level;
 *  - tdp_mmu_for_each_pte walks from the root_hpa of an MMU, starting at its
 *    shadow_root_level.
 */
679 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
680 for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
682 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
683 tdp_root_for_each_pte(_iter, _root, _start, _end) \
684 if (!is_shadow_present_pte(_iter.old_spte) || \
685 !is_last_spte(_iter.old_spte, _iter.level)) \
689 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) \
690 for_each_tdp_pte(_iter, __va(_mmu->root_hpa), \
691 _mmu->shadow_root_level, _start, _end)
694 * Yield if the MMU lock is contended or this thread needs to return control
697 * If this function should yield and flush is set, it will perform a remote
698 * TLB flush before yielding.
700 * If this function yields, iter->yielded is set and the caller must skip to
701 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
702 * over the paging structures to allow the iterator to continue its traversal
703 * from the paging structure root.
705 * Returns true if this function yielded.
/*
 * Possibly yield in the middle of a walk.  The iterator must not already be
 * in the yielded state.  A forward-progress check compares the next
 * last-level GFN with the GFN of the previous yield.  When a reschedule is
 * pending or mmu_lock is contended, the visible code can flush remote TLBs,
 * reschedule through the read or write flavour of cond_resched_rwlock, and
 * mark the iterator as yielded.  The function returns iter->yielded.
 */
707 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
708 struct tdp_iter *iter,
709 bool flush, bool shared)
711 WARN_ON(iter->yielded);
713 /* Ensure forward progress has been made before yielding. */
714 if (iter->next_last_level_gfn == iter->yielded_gfn)
717 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
721 kvm_flush_remote_tlbs(kvm);
724 cond_resched_rwlock_read(&kvm->mmu_lock);
726 cond_resched_rwlock_write(&kvm->mmu_lock);
730 WARN_ON(iter->gfn > iter->next_last_level_gfn);
732 iter->yielded = true;
735 return iter->yielded;
739 * Tears down the mappings for the range of gfns, [start, end), and frees the
740 * non-root pages mapping GFNs strictly within that range. Returns true if
741 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
744 * If can_yield is true, will release the MMU lock and reschedule if the
745 * scheduler needs the CPU or there is contention on the MMU lock. If this
746 * function cannot yield, it will not release the MMU lock or reschedule and
747 * the caller must ensure it does not supply too large a GFN range, or the
748 * operation can cause a soft lockup.
750 * If shared is true, this thread holds the MMU lock in read mode and must
751 * account for the possibility that other threads are modifying the paging
752 * structures concurrently. If shared is false, this thread should hold the
753 * MMU lock in write mode.
/*
 * Walk @root and zap present SPTEs in the requested range.  The end of the
 * walk is clamped to max_gfn_host, which is derived from shadow_phys_bits.  A
 * request that starts at 0 and reaches max_gfn_host counts as "zap all" and
 * restricts the walk to the level of the root.  SPTEs are cleared either with
 * tdp_mmu_set_spte() or with tdp_mmu_zap_spte_atomic(); when the atomic zap
 * fails, the SPTE value is re-read into the iterator.
 */
755 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
756 gfn_t start, gfn_t end, bool can_yield, bool flush,
759 gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
760 bool zap_all = (start == 0 && end >= max_gfn_host);
761 struct tdp_iter iter;
764 * No need to try to step down in the iterator when zapping all SPTEs,
765 * zapping the top-level non-leaf SPTEs will recurse on their children.
767 int min_level = zap_all ? root->role.level : PG_LEVEL_4K;
770 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
771 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
772 * and so KVM will never install a SPTE for such addresses.
774 end = min(end, max_gfn_host);
776 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
780 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
781 min_level, start, end) {
784 tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
789 if (!is_shadow_present_pte(iter.old_spte))
793 * If this is a non-last-level SPTE that covers a larger range
794 * than should be zapped, continue, and zap the mappings at a
795 * lower level, except when zapping all SPTEs.
799 iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
800 !is_last_spte(iter.old_spte, iter.level))
804 tdp_mmu_set_spte(kvm, &iter, 0);
806 } else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
808 * The iter must explicitly re-read the SPTE because
809 * the atomic cmpxchg failed.
811 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
821 * Tears down the mappings for the range of gfns, [start, end), and frees the
822 * non-root pages mapping GFNs strictly within that range. Returns true if
823 * SPTEs have been cleared and a TLB flush is needed before releasing the MMU lock.
/*
 * Apply zap_gfn_range() to every root of address space @as_id, feeding the
 * flush state returned by one call into the next.  Roots are visited with the
 * yield-safe iterator in non-shared mode.
 */
826 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
827 gfn_t end, bool can_yield, bool flush)
829 struct kvm_mmu_page *root;
831 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
832 flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
/*
 * Zap the GFN range from 0 to -1ull in each of the KVM_ADDRESS_SPACE_NUM
 * address spaces, then flush remote TLBs.
 * NOTE(review): the flush is presumably conditional on the accumulated flush
 * flag; that condition is not visible in this listing - confirm.
 */
838 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
843 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
844 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
847 kvm_flush_remote_tlbs(kvm);
/*
 * Find the next root, starting after @prev_root or from the head of
 * tdp_mmu_roots, that is marked invalid and still has a non-zero refcount.
 * All other roots are stepped over.  Only refcount_read() is used here, so no
 * reference is taken by this helper.
 */
850 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
851 struct kvm_mmu_page *prev_root)
853 struct kvm_mmu_page *next_root;
856 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
858 typeof(*prev_root), link);
860 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
861 typeof(*next_root), link);
863 while (next_root && !(next_root->role.invalid &&
864 refcount_read(&next_root->tdp_mmu_root_count)))
865 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
867 typeof(*next_root), link);
873 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
874 * invalidated root, they will not be freed until this function drops the
875 * reference. Before dropping that reference, tear down the paging
876 * structure so that whichever thread does drop the last reference
877 * only has to do a trivial amount of work. Since the roots are invalid,
878 * no new SPTEs should be created under them.
880 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
882 struct kvm_mmu_page *next_root;
883 struct kvm_mmu_page *root;
886 lockdep_assert_held_read(&kvm->mmu_lock);
890 root = next_invalidated_root(kvm, NULL);
893 next_root = next_invalidated_root(kvm, root);
897 flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
900 * Put the reference acquired in
901 * kvm_tdp_mmu_invalidate_all_roots
903 kvm_tdp_mmu_put_root(kvm, root, true);
913 kvm_flush_remote_tlbs(kvm);
917 * Mark each TDP MMU root as invalid so that other threads
918 * will drop their references and allow the root count to
921 * Also take a reference on all roots so that this thread
922 * can do the bulk of the work required to free the roots
923 * once they are invalidated. Without this reference, a
924 * vCPU thread might drop the last reference to a root and
925 * get stuck with tearing down the entire paging structure.
927 * Roots which have a zero refcount should be skipped as
928 * they're already being torn down.
929 * Already invalid roots should be referenced again so that
930 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
933 * This has essentially the same effect for the TDP MMU
934 * as updating mmu_valid_gen does for the shadow MMU.
/*
 * With mmu_lock held for write, walk every root on tdp_mmu_roots.  Each root
 * whose refcount can be raised from a non-zero value is marked invalid; the
 * reference taken by refcount_inc_not_zero() is not dropped here.
 */
936 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
938 struct kvm_mmu_page *root;
940 lockdep_assert_held_write(&kvm->mmu_lock);
941 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
942 if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
943 root->role.invalid = true;
947 * Installs a last-level SPTE to handle a TDP page fault.
948 * (NPT/EPT violation/misconfiguration)
/*
 * Build and install the final SPTE for a fault.  A no-slot PFN gets an MMIO
 * SPTE; the other visible path builds the SPTE with make_spte().  An SPTE
 * equal to the old value yields RET_PF_SPURIOUS; otherwise it is installed
 * atomically.  A write-protected result from make_spte() or an MMIO SPTE can
 * turn the result into RET_PF_EMULATE.  pf_fixed is incremented for every
 * outcome other than RET_PF_SPURIOUS.
 */
950 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
952 struct tdp_iter *iter,
953 kvm_pfn_t pfn, bool prefault)
956 int ret = RET_PF_FIXED;
957 int make_spte_ret = 0;
959 if (unlikely(is_noslot_pfn(pfn)))
960 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
962 make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
963 pfn, iter->old_spte, prefault, true,
964 map_writable, !shadow_accessed_mask,
967 if (new_spte == iter->old_spte)
968 ret = RET_PF_SPURIOUS;
969 else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
973 * If the page fault was caused by a write but the page is write
974 * protected, emulation is needed. If the emulation was skipped,
975 * the vCPU would have the same fault again.
977 if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
979 ret = RET_PF_EMULATE;
980 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
983 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
984 if (unlikely(is_mmio_spte(new_spte))) {
985 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
987 ret = RET_PF_EMULATE;
989 trace_kvm_mmu_set_spte(iter->level, iter->gfn,
990 rcu_dereference(iter->sptep));
994 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
995 * consistent with legacy MMU behavior.
997 if (ret != RET_PF_SPURIOUS)
998 vcpu->stat.pf_fixed++;
1004 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1005 * page tables and SPTEs to translate the faulting guest physical address.
/*
 * Fault-in path.  The mapping level is adjusted with
 * kvm_mmu_hugepage_adjust() (and, per entry, by the NX huge page workaround
 * when enabled), then the paging structure is walked for the faulting GFN.
 * On the way down, a present large-page SPTE is zapped atomically and
 * non-present entries are filled with newly allocated page tables.  A new
 * page table is linked when the atomic install succeeds and freed again when
 * it does not.  If the walk ends at a level other than the target,
 * RET_PF_RETRY is returned; the leaf itself is installed by
 * tdp_mmu_map_handle_target_level().
 */
1007 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
1008 int map_writable, int max_level, kvm_pfn_t pfn,
1011 bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
1012 bool write = error_code & PFERR_WRITE_MASK;
1013 bool exec = error_code & PFERR_FETCH_MASK;
1014 bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
1015 struct kvm_mmu *mmu = vcpu->arch.mmu;
1016 struct tdp_iter iter;
1017 struct kvm_mmu_page *sp;
1021 gfn_t gfn = gpa >> PAGE_SHIFT;
1025 level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
1026 huge_page_disallowed, &req_level);
1028 trace_kvm_mmu_spte_requested(gpa, level, pfn);
1032 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1033 if (nx_huge_page_workaround_enabled)
1034 disallowed_hugepage_adjust(iter.old_spte, gfn,
1035 iter.level, &pfn, &level);
1037 if (iter.level == level)
1041 * If there is an SPTE mapping a large page at a higher level
1042 * than the target, that SPTE must be cleared and replaced
1043 * with a non-leaf SPTE.
1045 if (is_shadow_present_pte(iter.old_spte) &&
1046 is_large_pte(iter.old_spte)) {
1047 if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1051 * The iter must explicitly re-read the spte here
1052 * because the new value informs the !present
1055 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1058 if (!is_shadow_present_pte(iter.old_spte)) {
1060 * If SPTE has been frozen by another thread, just
1061 * give up and retry, avoiding unnecessary page table
1062 * allocation and free.
1064 if (is_removed_spte(iter.old_spte))
1067 sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1070 new_spte = make_nonleaf_spte(child_pt,
1071 !shadow_accessed_mask);
1073 if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
1074 tdp_mmu_link_page(vcpu->kvm, sp,
1075 huge_page_disallowed &&
1076 req_level >= iter.level);
1078 trace_kvm_mmu_get_page(sp, true);
1080 tdp_mmu_free_sp(sp);
1086 if (iter.level != level) {
1088 return RET_PF_RETRY;
1091 ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
/*
 * Zap the GFN range described by @range in the address space of its memslot.
 * range->may_block is passed as the can_yield argument of
 * __kvm_tdp_mmu_zap_gfn_range().
 */
1098 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1101 return __kvm_tdp_mmu_zap_gfn_range(kvm, range->slot->as_id, range->start,
1102 range->end, range->may_block, flush);
/*
 * Per-SPTE callback type used by kvm_tdp_mmu_handle_gfn(), which ORs together
 * the bool results of all invocations.
 */
1105 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1106 struct kvm_gfn_range *range);
/*
 * Run @handler on the leaf SPTEs between range->start and range->end under
 * every root that belongs to the address space of the memslot in @range,
 * accumulating the handler results with OR.
 */
1108 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1109 struct kvm_gfn_range *range,
1110 tdp_handler_t handler)
1112 struct kvm_mmu_page *root;
1113 struct tdp_iter iter;
1119 * Don't support rescheduling, none of the MMU notifiers that funnel
1120 * into this helper allow blocking; it'd be dead, wasteful code.
1122 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1123 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1124 ret |= handler(kvm, &iter, range);
1133 * Mark the SPTEs mapping the range of GFNs [start, end) as unaccessed and
1134 * return non-zero if any of the GFNs in the range have been accessed.
/*
 * Clear the accessed state of one SPTE.  SPTEs that are not accessed are
 * tested for first.  For SPTEs with A/D bits enabled, shadow_accessed_mask is
 * cleared; the other visible path records the PFN of a writable SPTE as dirty
 * and converts the SPTE with mark_spte_for_access_track().  The new value is
 * written with tdp_mmu_set_spte_no_acc_track(), which skips the
 * accessed-tracking bookkeeping.
 */
1136 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1137 struct kvm_gfn_range *range)
1141 /* If we have a non-accessed entry we don't need to change the pte. */
1142 if (!is_accessed_spte(iter->old_spte))
1145 new_spte = iter->old_spte;
1147 if (spte_ad_enabled(new_spte)) {
1148 new_spte &= ~shadow_accessed_mask;
1151 * Capture the dirty status of the page, so that it doesn't get
1152 * lost when the SPTE is marked for access tracking.
1154 if (is_writable_pte(new_spte))
1155 kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1157 new_spte = mark_spte_for_access_track(new_spte);
1160 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
/*
 * Runs age_gfn_range() over the given range through kvm_tdp_mmu_handle_gfn()
 * and returns that helper's result.
 */
1165 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1167 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
/*
 * Handler with the tdp_handler_t signature: returns whether the SPTE snapshot
 * held in iter->old_spte is marked accessed. kvm and range are not used, and
 * nothing is written.
 */
1170 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1171 struct kvm_gfn_range *range)
1173 return is_accessed_spte(iter->old_spte);
/*
 * Runs test_age_gfn() over the given range through kvm_tdp_mmu_handle_gfn()
 * and returns that helper's result.
 */
1176 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1178 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1181 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1182 struct kvm_gfn_range *range)
1186 /* Huge pages aren't expected to be modified without first being zapped. */
1187 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1189 if (iter->level != PG_LEVEL_4K ||
1190 !is_shadow_present_pte(iter->old_spte))
1194 * Note, when changing a read-only SPTE, it's not strictly necessary to
1195 * zero the SPTE before setting the new PFN, but doing so preserves the
1196 * invariant that the PFN of a present leaf SPTE can never change.
1197 * See __handle_changed_spte().
1199 tdp_mmu_set_spte(kvm, iter, 0);
1201 if (!pte_write(range->pte)) {
1202 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1203 pte_pfn(range->pte));
1205 tdp_mmu_set_spte(kvm, iter, new_spte);
1212 * Handle the changed_pte MMU notifier for the TDP MMU.
1213 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1215 * Returns non-zero if a flush is needed before releasing the MMU lock.
/*
 * Applies set_spte_gfn() to the range via kvm_tdp_mmu_handle_gfn() and keeps
 * the result in flush. The remote TLB flush below covers a single GFN starting
 * at range->start.
 * NOTE(review): the header above still talks about a "data" argument, which
 * this signature does not have; set_spte_gfn() reads the new PTE from
 * range->pte instead. Confirm and refresh the header.
 */
1217 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1219 bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1221 /* FIXME: return 'flush' instead of flushing here. */
1223 kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1229 * Remove write access from all SPTEs at or above min_level that map GFNs
1230 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
/*
 * Iterates root's page table with for_each_tdp_pte_min_level(), passing
 * root->role.level and min_level, and clears PT_WRITABLE_MASK on matching
 * SPTEs through tdp_mmu_set_spte_atomic_no_dirty_log(). min_level values above
 * KVM_MAX_HUGEPAGE_LEVEL trip the BUG_ON().
 */
1233 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1234 gfn_t start, gfn_t end, int min_level)
1236 struct tdp_iter iter;
1238 bool spte_set = false;
1242 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1244 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1245 min_level, start, end) {
1247 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
/* The filter below singles out present, last-level SPTEs that are writable. */
1250 if (!is_shadow_present_pte(iter.old_spte) ||
1251 !is_last_spte(iter.old_spte, iter.level) ||
1252 !(iter.old_spte & PT_WRITABLE_MASK))
1255 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1257 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1260 * The iter must explicitly re-read the SPTE because
1261 * the atomic cmpxchg failed.
1263 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1274 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1275 * only affect leaf SPTEs down to min_level.
1276 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
/*
 * Asserts (lockdep) that kvm->mmu_lock is held for read, then calls
 * wrprot_gfn_range() on [slot->base_gfn, slot->base_gfn + slot->npages) for
 * each root yielded by for_each_valid_tdp_mmu_root_yield_safe() for the slot's
 * address space, OR-ing the results into spte_set.
 */
1278 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1279 const struct kvm_memory_slot *slot, int min_level)
1281 struct kvm_mmu_page *root;
1282 bool spte_set = false;
1284 lockdep_assert_held_read(&kvm->mmu_lock);
1286 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1287 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1288 slot->base_gfn + slot->npages, min_level);
1294 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1295 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1296 * If AD bits are not enabled, this will require clearing the writable bit on
1297 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
/*
 * Per-root worker: iterates [start, end) with tdp_root_for_each_leaf_pte() and
 * installs the updated value with tdp_mmu_set_spte_atomic_no_dirty_log(),
 * re-reading the SPTE when that call reports failure.
 */
1300 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1301 gfn_t start, gfn_t end)
1303 struct tdp_iter iter;
1305 bool spte_set = false;
1309 tdp_root_for_each_leaf_pte(iter, root, start, end) {
1311 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1314 if (!is_shadow_present_pte(iter.old_spte))
/*
 * Two ways of clearing dirty state are visible below: dropping
 * PT_WRITABLE_MASK when spte_ad_need_write_protect() is true for the SPTE,
 * and dropping shadow_dirty_mask.
 */
1317 if (spte_ad_need_write_protect(iter.old_spte)) {
1318 if (is_writable_pte(iter.old_spte))
1319 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1323 if (iter.old_spte & shadow_dirty_mask)
1324 new_spte = iter.old_spte & ~shadow_dirty_mask;
1329 if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
1332 * The iter must explicitly re-read the SPTE because
1333 * the atomic cmpxchg failed.
1335 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1346 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1347 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1348 * If AD bits are not enabled, this will require clearing the writable bit on
1349 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
/*
 * Asserts (lockdep) that kvm->mmu_lock is held for read, then calls
 * clear_dirty_gfn_range() on [slot->base_gfn, slot->base_gfn + slot->npages)
 * for each root yielded by for_each_valid_tdp_mmu_root_yield_safe() for the
 * slot's address space, OR-ing the results into spte_set.
 */
1352 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1353 const struct kvm_memory_slot *slot)
1355 struct kvm_mmu_page *root;
1356 bool spte_set = false;
1358 lockdep_assert_held_read(&kvm->mmu_lock);
1360 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1361 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1362 slot->base_gfn + slot->npages);
1368 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1369 * set in mask, starting at gfn. The given memslot is expected to contain all
1370 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1371 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1372 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
/*
 * Per-root worker. Bit i of mask stands for GFN gfn + i: the walk starts at
 * the GFN of the lowest set bit (__ffs(mask)) and is bounded by
 * gfn + BITS_PER_LONG. The bit of each GFN that is processed is cleared from
 * the local copy of mask. Updates go through tdp_mmu_set_spte_no_dirty_log().
 */
1374 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1375 gfn_t gfn, unsigned long mask, bool wrprot)
1377 struct tdp_iter iter;
1382 tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1383 gfn + BITS_PER_LONG) {
/* Tests for entries above 4K and for GFNs whose bit is not set in mask. */
1387 if (iter.level > PG_LEVEL_4K ||
1388 !(mask & (1UL << (iter.gfn - gfn))))
1391 mask &= ~(1UL << (iter.gfn - gfn));
/*
 * Write protection is chosen when the caller asked for it (wrprot) or when
 * spte_ad_need_write_protect() is true for the SPTE; the other visible form
 * clears shadow_dirty_mask.
 */
1393 if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1394 if (is_writable_pte(iter.old_spte))
1395 new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1399 if (iter.old_spte & shadow_dirty_mask)
1400 new_spte = iter.old_spte & ~shadow_dirty_mask;
1405 tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1412 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1413 * set in mask, starting at gfn. The given memslot is expected to contain all
1414 * the GFNs represented by set bits in the mask. If AD bits are enabled,
1415 * clearing the dirty status will involve clearing the dirty bit on each SPTE
1416 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
/*
 * Asserts (lockdep) that kvm->mmu_lock is held for write, then calls
 * clear_dirty_pt_masked() with the same gfn, mask and wrprot for every root
 * that for_each_tdp_mmu_root() yields for the slot's address space.
 */
1418 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1419 struct kvm_memory_slot *slot,
1420 gfn_t gfn, unsigned long mask,
1423 struct kvm_mmu_page *root;
1425 lockdep_assert_held_write(&kvm->mmu_lock);
1426 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1427 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1431 * Clear leaf entries which could be replaced by large mappings, for
1432 * GFNs within the slot.
/*
 * Per-root worker covering [slot->base_gfn, slot->base_gfn + slot->npages).
 * Zapping is done with tdp_mmu_zap_spte_atomic(); when that call reports
 * failure the SPTE is re-read into iter.old_spte.
 */
1434 static void zap_collapsible_spte_range(struct kvm *kvm,
1435 struct kvm_mmu_page *root,
1436 const struct kvm_memory_slot *slot)
1438 gfn_t start = slot->base_gfn;
1439 gfn_t end = start + slot->npages;
1440 struct tdp_iter iter;
1445 tdp_root_for_each_pte(iter, root, start, end) {
1447 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
/* The filter below tests for SPTEs that are not present or not last-level. */
1450 if (!is_shadow_present_pte(iter.old_spte) ||
1451 !is_last_spte(iter.old_spte, iter.level))
/*
 * The check below tests for a reserved PFN, or for a mapping whose level is
 * already at or above what kvm_mmu_max_mapping_level() reports for this GFN.
 */
1454 pfn = spte_to_pfn(iter.old_spte);
1455 if (kvm_is_reserved_pfn(pfn) ||
1456 iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1460 /* Note, a successful atomic zap also does a remote TLB flush. */
1461 if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1463 * The iter must explicitly re-read the SPTE because
1464 * the atomic cmpxchg failed.
1466 iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1475 * Clear non-leaf entries (and free associated page tables) which could
1476 * be replaced by large mappings, for GFNs within the slot.
/*
 * Asserts (lockdep) that kvm->mmu_lock is held for read, then runs
 * zap_collapsible_spte_range() on each root yielded by
 * for_each_valid_tdp_mmu_root_yield_safe() for the slot's address space.
 * NOTE(review): the header above says "non-leaf entries", while the helper's
 * own header says "leaf entries" and its visible filter tests is_last_spte().
 * Confirm which wording is intended.
 */
1478 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1479 const struct kvm_memory_slot *slot)
1481 struct kvm_mmu_page *root;
1483 lockdep_assert_held_read(&kvm->mmu_lock);
1485 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1486 zap_collapsible_spte_range(kvm, root, slot);
1490 * Removes write access on the last level SPTE mapping this GFN and unsets the
1491 * MMU-writable bit to ensure future writes continue to be intercepted.
1492 * Returns true if an SPTE was set and a TLB flush is needed.
/*
 * Per-root worker for the single-GFN range [gfn, gfn + 1), iterated with
 * for_each_tdp_pte_min_level() using root->role.level and min_level. Updates
 * are installed with tdp_mmu_set_spte(). min_level values above
 * KVM_MAX_HUGEPAGE_LEVEL trip the BUG_ON().
 */
1494 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1495 gfn_t gfn, int min_level)
1497 struct tdp_iter iter;
1499 bool spte_set = false;
1501 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1505 for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1506 min_level, gfn, gfn + 1) {
1507 if (!is_shadow_present_pte(iter.old_spte) ||
1508 !is_last_spte(iter.old_spte, iter.level))
/* Clear both PT_WRITABLE_MASK and shadow_mmu_writable_mask in one step. */
1511 new_spte = iter.old_spte &
1512 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
/* Equal values mean neither of the two bits was set in the old SPTE. */
1514 if (new_spte == iter.old_spte)
1517 tdp_mmu_set_spte(kvm, &iter, new_spte);
1527 * Removes write access on the last level SPTE mapping this GFN and unsets the
1528 * MMU-writable bit to ensure future writes continue to be intercepted.
1529 * Returns true if an SPTE was set and a TLB flush is needed.
/*
 * Asserts (lockdep) that kvm->mmu_lock is held for write, then calls
 * write_protect_gfn() for every root that for_each_tdp_mmu_root() yields for
 * the slot's address space, OR-ing the results into spte_set.
 */
1531 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1532 struct kvm_memory_slot *slot, gfn_t gfn,
1535 struct kvm_mmu_page *root;
1536 bool spte_set = false;
1538 lockdep_assert_held_write(&kvm->mmu_lock);
1539 for_each_tdp_mmu_root(kvm, root, slot->as_id)
1540 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1546 * Return the level of the lowest level SPTE added to sptes.
1547 * That SPTE may be non-present.
1549 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
/*
 * Walks the vCPU's current MMU (vcpu->arch.mmu) for the single GFN derived
 * from addr (addr >> PAGE_SHIFT). *root_level is set to the MMU's
 * shadow_root_level, and the SPTE value seen at each step is stored in
 * sptes[leaf] (leaf is presumably the level of the current step; its
 * assignment is not visible in this listing).
 */
1551 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1554 struct tdp_iter iter;
1555 struct kvm_mmu *mmu = vcpu->arch.mmu;
1556 gfn_t gfn = addr >> PAGE_SHIFT;
1559 *root_level = vcpu->arch.mmu->shadow_root_level;
1561 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1563 sptes[leaf] = iter.old_spte;
1570 * Returns the last level spte pointer of the shadow page walk for the given
1571 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1572 * walk could be performed, returns NULL and *spte does not contain valid data.
1575 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1576 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1578 * WARNING: This function is only intended to be called during fast_page_fault.
/*
 * Walks the vCPU's current MMU (vcpu->arch.mmu) for the single GFN derived
 * from addr (addr >> PAGE_SHIFT), copying the SPTE value of each step into
 * *spte. sptep starts out NULL and the function returns
 * rcu_dereference(sptep); the statement that updates sptep inside the loop is
 * not visible in this listing.
 */
1580 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1583 struct tdp_iter iter;
1584 struct kvm_mmu *mmu = vcpu->arch.mmu;
1585 gfn_t gfn = addr >> PAGE_SHIFT;
1586 tdp_ptep_t sptep = NULL;
1588 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1589 *spte = iter.old_spte;
1594 * Perform the rcu_dereference to get the raw spte pointer value since
1595 * we are passing it up to fast_page_fault, which is shared with the
1596 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1599 * This is safe since fast_page_fault obeys the contracts of this
1600 * function as well as all TDP MMU contracts around modifying SPTEs
1601 * outside of mmu_lock.
1603 return rcu_dereference(sptep);