arch/loongarch/kvm/mmu.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
   4  */
   5
   6 #include <linux/highmem.h>
   7 #include <linux/hugetlb.h>
   8 #include <linux/kvm_host.h>
   9 #include <linux/page-flags.h>
  10 #include <linux/uaccess.h>
  11 #include <asm/mmu_context.h>
  12 #include <asm/pgalloc.h>
  13 #include <asm/tlb.h>
  14 #include <asm/kvm_mmu.h>
  15
  16 static inline void kvm_ptw_prepare(struct kvm *kvm, kvm_ptw_ctx *ctx)
  17 {
  18         ctx->level = kvm->arch.root_level;
  19         /* pte table */
  20         ctx->invalid_ptes  = kvm->arch.invalid_ptes;
  21         ctx->pte_shifts    = kvm->arch.pte_shifts;
  22         ctx->pgtable_shift = ctx->pte_shifts[ctx->level];
  23         ctx->invalid_entry = ctx->invalid_ptes[ctx->level];
  24         ctx->opaque        = kvm;
  25 }
  26
  27 /*
  28  * Mark a range of guest physical address space old (all accesses fault) in the
  29  * VM's GPA page table to allow detection of commonly used pages.
  30  */
  31 static int kvm_mkold_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
  32 {
  33         if (kvm_pte_young(*pte)) {
  34                 *pte = kvm_pte_mkold(*pte);
  35                 return 1;
  36         }
  37
  38         return 0;
  39 }
  40
  41 /*
  42  * Mark a range of guest physical address space clean (writes fault) in the VM's
  43  * GPA page table to allow dirty page tracking.
  44  */
  45 static int kvm_mkclean_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
  46 {
  47         gfn_t offset;
  48         kvm_pte_t val;
  49
  50         val = *pte;
  51         /*
  52          * For kvm_arch_mmu_enable_log_dirty_pt_masked with mask, start and end
  53          * may cross hugepage, for first huge page parameter addr is equal to
  54          * start, however for the second huge page addr is base address of
  55          * this huge page, rather than start or end address
  56          */
  57         if ((ctx->flag & _KVM_HAS_PGMASK) && !kvm_pte_huge(val)) {
  58                 offset = (addr >> PAGE_SHIFT) - ctx->gfn;
  59                 if (!(BIT(offset) & ctx->mask))
  60                         return 0;
  61         }
  62
  63         /*
  64          * Need not split huge page now, just set write-proect pte bit
  65          * Split huge page until next write fault
  66          */
  67         if (kvm_pte_dirty(val)) {
  68                 *pte = kvm_pte_mkclean(val);
  69                 return 1;
  70         }
  71
  72         return 0;
  73 }
  74
  75 /*
  76  * Clear pte entry
  77  */
  78 static int kvm_flush_pte(kvm_pte_t *pte, phys_addr_t addr, kvm_ptw_ctx *ctx)
  79 {
  80         struct kvm *kvm;
  81
  82         kvm = ctx->opaque;
  83         if (ctx->level)
  84                 kvm->stat.hugepages--;
  85         else
  86                 kvm->stat.pages--;
  87
  88         *pte = ctx->invalid_entry;
  89
  90         return 1;
  91 }
  92
  93 /*
  94  * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
  95  *
  96  * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
  97  * to host physical page mappings.
  98  *
  99  * Returns:     Pointer to new KVM GPA page directory.
 100  *              NULL on allocation failure.
 101  */
 102 kvm_pte_t *kvm_pgd_alloc(void)
 103 {
 104         kvm_pte_t *pgd;
 105
 106         pgd = (kvm_pte_t *)__get_free_pages(GFP_KERNEL, 0);
 107         if (pgd)
 108                 pgd_init((void *)pgd);
 109
 110         return pgd;
 111 }
 112
 113 static void _kvm_pte_init(void *addr, unsigned long val)
 114 {
 115         unsigned long *p, *end;
 116
 117         p = (unsigned long *)addr;
 118         end = p + PTRS_PER_PTE;
 119         do {
 120                 p[0] = val;
 121                 p[1] = val;
 122                 p[2] = val;
 123                 p[3] = val;
 124                 p[4] = val;
 125                 p += 8;
 126                 p[-3] = val;
 127                 p[-2] = val;
 128                 p[-1] = val;
 129         } while (p != end);
 130 }
 131
 132 /*
 133  * Caller must hold kvm->mm_lock
 134  *
 135  * Walk the page tables of kvm to find the PTE corresponding to the
 136  * address @addr. If page tables don't exist for @addr, they will be created
 137  * from the MMU cache if @cache is not NULL.
 138  */
 139 static kvm_pte_t *kvm_populate_gpa(struct kvm *kvm,
 140                                 struct kvm_mmu_memory_cache *cache,
 141                                 unsigned long addr, int level)
 142 {
 143         kvm_ptw_ctx ctx;
 144         kvm_pte_t *entry, *child;
 145
 146         kvm_ptw_prepare(kvm, &ctx);
 147         child = kvm->arch.pgd;
 148         while (ctx.level > level) {
 149                 entry = kvm_pgtable_offset(&ctx, child, addr);
 150                 if (kvm_pte_none(&ctx, entry)) {
 151                         if (!cache)
 152                                 return NULL;
 153
 154                         child = kvm_mmu_memory_cache_alloc(cache);
 155                         _kvm_pte_init(child, ctx.invalid_ptes[ctx.level - 1]);
 156                         kvm_set_pte(entry, __pa(child));
 157                 } else if (kvm_pte_huge(*entry)) {
 158                         return entry;
 159                 } else
 160                         child = (kvm_pte_t *)__va(PHYSADDR(*entry));
 161                 kvm_ptw_enter(&ctx);
 162         }
 163
 164         entry = kvm_pgtable_offset(&ctx, child, addr);
 165
 166         return entry;
 167 }
 168
 169 /*
 170  * Page walker for VM shadow mmu at last level
 171  * The last level is small pte page or huge pmd page
 172  */
 173 static int kvm_ptw_leaf(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
 174 {
 175         int ret;
 176         phys_addr_t next, start, size;
 177         struct list_head *list;
 178         kvm_pte_t *entry, *child;
 179
 180         ret = 0;
 181         start = addr;
 182         child = (kvm_pte_t *)__va(PHYSADDR(*dir));
 183         entry = kvm_pgtable_offset(ctx, child, addr);
 184         do {
 185                 next = addr + (0x1UL << ctx->pgtable_shift);
 186                 if (!kvm_pte_present(ctx, entry))
 187                         continue;
 188
 189                 ret |= ctx->ops(entry, addr, ctx);
 190         } while (entry++, addr = next, addr < end);
 191
 192         if (kvm_need_flush(ctx)) {
 193                 size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
 194                 if (start + size == end) {
 195                         list = (struct list_head *)child;
 196                         list_add_tail(list, &ctx->list);
 197                         *dir = ctx->invalid_ptes[ctx->level + 1];
 198                 }
 199         }
 200
 201         return ret;
 202 }
 203
 204 /*
 205  * Page walker for VM shadow mmu at page table dir level
 206  */
 207 static int kvm_ptw_dir(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
 208 {
 209         int ret;
 210         phys_addr_t next, start, size;
 211         struct list_head *list;
 212         kvm_pte_t *entry, *child;
 213
 214         ret = 0;
 215         start = addr;
 216         child = (kvm_pte_t *)__va(PHYSADDR(*dir));
 217         entry = kvm_pgtable_offset(ctx, child, addr);
 218         do {
 219                 next = kvm_pgtable_addr_end(ctx, addr, end);
 220                 if (!kvm_pte_present(ctx, entry))
 221                         continue;
 222
 223                 if (kvm_pte_huge(*entry)) {
 224                         ret |= ctx->ops(entry, addr, ctx);
 225                         continue;
 226                 }
 227
 228                 kvm_ptw_enter(ctx);
 229                 if (ctx->level == 0)
 230                         ret |= kvm_ptw_leaf(entry, addr, next, ctx);
 231                 else
 232                         ret |= kvm_ptw_dir(entry, addr, next, ctx);
 233                 kvm_ptw_exit(ctx);
 234         }  while (entry++, addr = next, addr < end);
 235
 236         if (kvm_need_flush(ctx)) {
 237                 size = 0x1UL << (ctx->pgtable_shift + PAGE_SHIFT - 3);
 238                 if (start + size == end) {
 239                         list = (struct list_head *)child;
 240                         list_add_tail(list, &ctx->list);
 241                         *dir = ctx->invalid_ptes[ctx->level + 1];
 242                 }
 243         }
 244
 245         return ret;
 246 }
 247
 248 /*
 249  * Page walker for VM shadow mmu at page root table
 250  */
 251 static int kvm_ptw_top(kvm_pte_t *dir, phys_addr_t addr, phys_addr_t end, kvm_ptw_ctx *ctx)
 252 {
 253         int ret;
 254         phys_addr_t next;
 255         kvm_pte_t *entry;
 256
 257         ret = 0;
 258         entry = kvm_pgtable_offset(ctx, dir, addr);
 259         do {
 260                 next = kvm_pgtable_addr_end(ctx, addr, end);
 261                 if (!kvm_pte_present(ctx, entry))
 262                         continue;
 263
 264                 kvm_ptw_enter(ctx);
 265                 ret |= kvm_ptw_dir(entry, addr, next, ctx);
 266                 kvm_ptw_exit(ctx);
 267         }  while (entry++, addr = next, addr < end);
 268
 269         return ret;
 270 }
 271
 272 /*
 273  * kvm_flush_range() - Flush a range of guest physical addresses.
 274  * @kvm:        KVM pointer.
 275  * @start_gfn:  Guest frame number of first page in GPA range to flush.
 276  * @end_gfn:    Guest frame number of last page in GPA range to flush.
 277  * @lock:       Whether to hold mmu_lock or not
 278  *
 279  * Flushes a range of GPA mappings from the GPA page tables.
 280  */
 281 static void kvm_flush_range(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn, int lock)
 282 {
 283         int ret;
 284         kvm_ptw_ctx ctx;
 285         struct list_head *pos, *temp;
 286
 287         ctx.ops = kvm_flush_pte;
 288         ctx.flag = _KVM_FLUSH_PGTABLE;
 289         kvm_ptw_prepare(kvm, &ctx);
 290         INIT_LIST_HEAD(&ctx.list);
 291
 292         if (lock) {
 293                 spin_lock(&kvm->mmu_lock);
 294                 ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
 295                                         end_gfn << PAGE_SHIFT, &ctx);
 296                 spin_unlock(&kvm->mmu_lock);
 297         } else
 298                 ret = kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT,
 299                                         end_gfn << PAGE_SHIFT, &ctx);
 300
 301         /* Flush vpid for each vCPU individually */
 302         if (ret)
 303                 kvm_flush_remote_tlbs(kvm);
 304
 305         /*
 306          * free pte table page after mmu_lock
 307          * the pte table page is linked together with ctx.list
 308          */
 309         list_for_each_safe(pos, temp, &ctx.list) {
 310                 list_del(pos);
 311                 free_page((unsigned long)pos);
 312         }
 313 }
 314
 315 /*
 316  * kvm_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
 317  * @kvm:        KVM pointer.
 318  * @start_gfn:  Guest frame number of first page in GPA range to flush.
 319  * @end_gfn:    Guest frame number of last page in GPA range to flush.
 320  *
 321  * Make a range of GPA mappings clean so that guest writes will fault and
 322  * trigger dirty page logging.
 323  *
 324  * The caller must hold the @kvm->mmu_lock spinlock.
 325  *
 326  * Returns:     Whether any GPA mappings were modified, which would require
 327  *              derived mappings (GVA page tables & TLB enties) to be
 328  *              invalidated.
 329  */
 330 static int kvm_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
 331 {
 332         kvm_ptw_ctx ctx;
 333
 334         ctx.ops = kvm_mkclean_pte;
 335         ctx.flag = 0;
 336         kvm_ptw_prepare(kvm, &ctx);
 337         return kvm_ptw_top(kvm->arch.pgd, start_gfn << PAGE_SHIFT, end_gfn << PAGE_SHIFT, &ctx);
 338 }
 339
 340 /*
 341  * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
 342  * @kvm:        The KVM pointer
 343  * @slot:       The memory slot associated with mask
 344  * @gfn_offset: The gfn offset in memory slot
 345  * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
 346  *              slot to be write protected
 347  *
 348  * Walks bits set in mask write protects the associated pte's. Caller must
 349  * acquire @kvm->mmu_lock.
 350  */
 351 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 352                 struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask)
 353 {
 354         kvm_ptw_ctx ctx;
 355         gfn_t base_gfn = slot->base_gfn + gfn_offset;
 356         gfn_t start = base_gfn + __ffs(mask);
 357         gfn_t end = base_gfn + __fls(mask) + 1;
 358
 359         ctx.ops = kvm_mkclean_pte;
 360         ctx.flag = _KVM_HAS_PGMASK;
 361         ctx.mask = mask;
 362         ctx.gfn = base_gfn;
 363         kvm_ptw_prepare(kvm, &ctx);
 364
 365         kvm_ptw_top(kvm->arch.pgd, start << PAGE_SHIFT, end << PAGE_SHIFT, &ctx);
 366 }
 367
 368 void kvm_arch_commit_memory_region(struct kvm *kvm,
 369                                    struct kvm_memory_slot *old,
 370                                    const struct kvm_memory_slot *new,
 371                                    enum kvm_mr_change change)
 372 {
 373         int needs_flush;
 374
 375         /*
 376          * If dirty page logging is enabled, write protect all pages in the slot
 377          * ready for dirty logging.
 378          *
 379          * There is no need to do this in any of the following cases:
 380          * CREATE:      No dirty mappings will already exist.
 381          * MOVE/DELETE: The old mappings will already have been cleaned up by
 382          *              kvm_arch_flush_shadow_memslot()
 383          */
 384         if (change == KVM_MR_FLAGS_ONLY &&
 385             (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
 386              new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
 387                 spin_lock(&kvm->mmu_lock);
 388                 /* Write protect GPA page table entries */
 389                 needs_flush = kvm_mkclean_gpa_pt(kvm, new->base_gfn,
 390                                         new->base_gfn + new->npages);
 391                 spin_unlock(&kvm->mmu_lock);
 392                 if (needs_flush)
 393                         kvm_flush_remote_tlbs(kvm);
 394         }
 395 }
 396
 397 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 398 {
 399         kvm_flush_range(kvm, 0, kvm->arch.gpa_size >> PAGE_SHIFT, 0);
 400 }
 401
 402 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 403 {
 404         /*
 405          * The slot has been made invalid (ready for moving or deletion), so we
 406          * need to ensure that it can no longer be accessed by any guest vCPUs.
 407          */
 408         kvm_flush_range(kvm, slot->base_gfn, slot->base_gfn + slot->npages, 1);
 409 }
 410
 411 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 412 {
 413         kvm_ptw_ctx ctx;
 414
 415         ctx.flag = 0;
 416         ctx.ops = kvm_flush_pte;
 417         kvm_ptw_prepare(kvm, &ctx);
 418         INIT_LIST_HEAD(&ctx.list);
 419
 420         return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
 421                         range->end << PAGE_SHIFT, &ctx);
 422 }
 423
 424 bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 425 {
 426         unsigned long prot_bits;
 427         kvm_pte_t *ptep;
 428         kvm_pfn_t pfn = pte_pfn(range->arg.pte);
 429         gpa_t gpa = range->start << PAGE_SHIFT;
 430
 431         ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
 432         if (!ptep)
 433                 return false;
 434
 435         /* Replacing an absent or old page doesn't need flushes */
 436         if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
 437                 kvm_set_pte(ptep, 0);
 438                 return false;
 439         }
 440
 441         /* Fill new pte if write protected or page migrated */
 442         prot_bits = _PAGE_PRESENT | __READABLE;
 443         prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);
 444
 445         /*
 446          * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support
 447          * _PAGE_WRITE for map_page_fast if next page write fault
 448          * _PAGE_DIRTY since gpa has already recorded as dirty page
 449          */
 450         prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
 451         kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));
 452
 453         return true;
 454 }
 455
 456 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 457 {
 458         kvm_ptw_ctx ctx;
 459
 460         ctx.flag = 0;
 461         ctx.ops = kvm_mkold_pte;
 462         kvm_ptw_prepare(kvm, &ctx);
 463
 464         return kvm_ptw_top(kvm->arch.pgd, range->start << PAGE_SHIFT,
 465                                 range->end << PAGE_SHIFT, &ctx);
 466 }
 467
 468 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 469 {
 470         gpa_t gpa = range->start << PAGE_SHIFT;
 471         kvm_pte_t *ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
 472
 473         if (ptep && kvm_pte_present(NULL, ptep) && kvm_pte_young(*ptep))
 474                 return true;
 475
 476         return false;
 477 }
 478
 479 /*
 480  * kvm_map_page_fast() - Fast path GPA fault handler.
 481  * @vcpu:               vCPU pointer.
 482  * @gpa:                Guest physical address of fault.
 483  * @write:      Whether the fault was due to a write.
 484  *
 485  * Perform fast path GPA fault handling, doing all that can be done without
 486  * calling into KVM. This handles marking old pages young (for idle page
 487  * tracking), and dirtying of clean pages (for dirty page logging).
 488  *
 489  * Returns:     0 on success, in which case we can update derived mappings and
 490  *              resume guest execution.
 491  *              -EFAULT on failure due to absent GPA mapping or write to
 492  *              read-only page, in which case KVM must be consulted.
 493  */
 494 static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
 495 {
 496         int ret = 0;
 497         kvm_pfn_t pfn = 0;
 498         kvm_pte_t *ptep, changed, new;
 499         gfn_t gfn = gpa >> PAGE_SHIFT;
 500         struct kvm *kvm = vcpu->kvm;
 501         struct kvm_memory_slot *slot;
 502
 503         spin_lock(&kvm->mmu_lock);
 504
 505         /* Fast path - just check GPA page table for an existing entry */
 506         ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
 507         if (!ptep || !kvm_pte_present(NULL, ptep)) {
 508                 ret = -EFAULT;
 509                 goto out;
 510         }
 511
 512         /* Track access to pages marked old */
 513         new = *ptep;
 514         if (!kvm_pte_young(new))
 515                 new = kvm_pte_mkyoung(new);
 516                 /* call kvm_set_pfn_accessed() after unlock */
 517
 518         if (write && !kvm_pte_dirty(new)) {
 519                 if (!kvm_pte_write(new)) {
 520                         ret = -EFAULT;
 521                         goto out;
 522                 }
 523
 524                 if (kvm_pte_huge(new)) {
 525                         /*
 526                          * Do not set write permission when dirty logging is
 527                          * enabled for HugePages
 528                          */
 529                         slot = gfn_to_memslot(kvm, gfn);
 530                         if (kvm_slot_dirty_track_enabled(slot)) {
 531                                 ret = -EFAULT;
 532                                 goto out;
 533                         }
 534                 }
 535
 536                 /* Track dirtying of writeable pages */
 537                 new = kvm_pte_mkdirty(new);
 538         }
 539
 540         changed = new ^ (*ptep);
 541         if (changed) {
 542                 kvm_set_pte(ptep, new);
 543                 pfn = kvm_pte_pfn(new);
 544         }
 545         spin_unlock(&kvm->mmu_lock);
 546
 547         /*
 548          * Fixme: pfn may be freed after mmu_lock
 549          * kvm_try_get_pfn(pfn)/kvm_release_pfn pair to prevent this?
 550          */
 551         if (kvm_pte_young(changed))
 552                 kvm_set_pfn_accessed(pfn);
 553
 554         if (kvm_pte_dirty(changed)) {
 555                 mark_page_dirty(kvm, gfn);
 556                 kvm_set_pfn_dirty(pfn);
 557         }
 558         return ret;
 559 out:
 560         spin_unlock(&kvm->mmu_lock);
 561         return ret;
 562 }
 563
 564 static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot,
 565                                 unsigned long hva, unsigned long map_size, bool write)
 566 {
 567         size_t size;
 568         gpa_t gpa_start;
 569         hva_t uaddr_start, uaddr_end;
 570
 571         /* Disable dirty logging on HugePages */
 572         if (kvm_slot_dirty_track_enabled(memslot) && write)
 573                 return false;
 574
 575         size = memslot->npages * PAGE_SIZE;
 576         gpa_start = memslot->base_gfn << PAGE_SHIFT;
 577         uaddr_start = memslot->userspace_addr;
 578         uaddr_end = uaddr_start + size;
 579
 580         /*
 581          * Pages belonging to memslots that don't have the same alignment
 582          * within a PMD for userspace and GPA cannot be mapped with stage-2
 583          * PMD entries, because we'll end up mapping the wrong pages.
 584          *
 585          * Consider a layout like the following:
 586          *
 587          *    memslot->userspace_addr:
 588          *    +-----+--------------------+--------------------+---+
 589          *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
 590          *    +-----+--------------------+--------------------+---+
 591          *
 592          *    memslot->base_gfn << PAGE_SIZE:
 593          *      +---+--------------------+--------------------+-----+
 594          *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
 595          *      +---+--------------------+--------------------+-----+
 596          *
 597          * If we create those stage-2 blocks, we'll end up with this incorrect
 598          * mapping:
 599          *   d -> f
 600          *   e -> g
 601          *   f -> h
 602          */
 603         if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
 604                 return false;
 605
 606         /*
 607          * Next, let's make sure we're not trying to map anything not covered
 608          * by the memslot. This means we have to prohibit block size mappings
 609          * for the beginning and end of a non-block aligned and non-block sized
 610          * memory slot (illustrated by the head and tail parts of the
 611          * userspace view above containing pages 'abcde' and 'xyz',
 612          * respectively).
 613          *
 614          * Note that it doesn't matter if we do the check using the
 615          * userspace_addr or the base_gfn, as both are equally aligned (per
 616          * the check above) and equally sized.
 617          */
 618         return (hva & ~(map_size - 1)) >= uaddr_start &&
 619                 (hva & ~(map_size - 1)) + map_size <= uaddr_end;
 620 }
 621
 622 /*
 623  * Lookup the mapping level for @gfn in the current mm.
 624  *
 625  * WARNING!  Use of host_pfn_mapping_level() requires the caller and the end
 626  * consumer to be tied into KVM's handlers for MMU notifier events!
 627  *
 628  * There are several ways to safely use this helper:
 629  *
 630  * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before
 631  *   consuming it.  In this case, mmu_lock doesn't need to be held during the
 632  *   lookup, but it does need to be held while checking the MMU notifier.
 633  *
 634  * - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
 635  *   event for the hva.  This can be done by explicit checking the MMU notifier
 636  *   or by ensuring that KVM already has a valid mapping that covers the hva.
 637  *
 638  * - Do not use the result to install new mappings, e.g. use the host mapping
 639  *   level only to decide whether or not to zap an entry.  In this case, it's
 640  *   not required to hold mmu_lock (though it's highly likely the caller will
 641  *   want to hold mmu_lock anyways, e.g. to modify SPTEs).
 642  *
 643  * Note!  The lookup can still race with modifications to host page tables, but
 644  * the above "rules" ensure KVM will not _consume_ the result of the walk if a
 645  * race with the primary MMU occurs.
 646  */
 647 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
 648                                 const struct kvm_memory_slot *slot)
 649 {
 650         int level = 0;
 651         unsigned long hva;
 652         unsigned long flags;
 653         pgd_t pgd;
 654         p4d_t p4d;
 655         pud_t pud;
 656         pmd_t pmd;
 657
 658         /*
 659          * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
 660          * is not solely for performance, it's also necessary to avoid the
 661          * "writable" check in __gfn_to_hva_many(), which will always fail on
 662          * read-only memslots due to gfn_to_hva() assuming writes.  Earlier
 663          * page fault steps have already verified the guest isn't writing a
 664          * read-only memslot.
 665          */
 666         hva = __gfn_to_hva_memslot(slot, gfn);
 667
 668         /*
 669          * Disable IRQs to prevent concurrent tear down of host page tables,
 670          * e.g. if the primary MMU promotes a P*D to a huge page and then frees
 671          * the original page table.
 672          */
 673         local_irq_save(flags);
 674
 675         /*
 676          * Read each entry once.  As above, a non-leaf entry can be promoted to
 677          * a huge page _during_ this walk.  Re-reading the entry could send the
 678          * walk into the weeks, e.g. p*d_large() returns false (sees the old
 679          * value) and then p*d_offset() walks into the target huge page instead
 680          * of the old page table (sees the new value).
 681          */
 682         pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
 683         if (pgd_none(pgd))
 684                 goto out;
 685
 686         p4d = READ_ONCE(*p4d_offset(&pgd, hva));
 687         if (p4d_none(p4d) || !p4d_present(p4d))
 688                 goto out;
 689
 690         pud = READ_ONCE(*pud_offset(&p4d, hva));
 691         if (pud_none(pud) || !pud_present(pud))
 692                 goto out;
 693
 694         pmd = READ_ONCE(*pmd_offset(&pud, hva));
 695         if (pmd_none(pmd) || !pmd_present(pmd))
 696                 goto out;
 697
 698         if (kvm_pte_huge(pmd_val(pmd)))
 699                 level = 1;
 700
 701 out:
 702         local_irq_restore(flags);
 703         return level;
 704 }
 705
 706 /*
 707  * Split huge page
 708  */
 709 static kvm_pte_t *kvm_split_huge(struct kvm_vcpu *vcpu, kvm_pte_t *ptep, gfn_t gfn)
 710 {
 711         int i;
 712         kvm_pte_t val, *child;
 713         struct kvm *kvm = vcpu->kvm;
 714         struct kvm_mmu_memory_cache *memcache;
 715
 716         memcache = &vcpu->arch.mmu_page_cache;
 717         child = kvm_mmu_memory_cache_alloc(memcache);
 718         val = kvm_pte_mksmall(*ptep);
 719         for (i = 0; i < PTRS_PER_PTE; i++) {
 720                 kvm_set_pte(child + i, val);
 721                 val += PAGE_SIZE;
 722         }
 723
 724         /* The later kvm_flush_tlb_gpa() will flush hugepage tlb */
 725         kvm_set_pte(ptep, __pa(child));
 726
 727         kvm->stat.hugepages--;
 728         kvm->stat.pages += PTRS_PER_PTE;
 729
 730         return child + (gfn & (PTRS_PER_PTE - 1));
 731 }
 732
 733 /*
 734  * kvm_map_page() - Map a guest physical page.
 735  * @vcpu:               vCPU pointer.
 736  * @gpa:                Guest physical address of fault.
 737  * @write:      Whether the fault was due to a write.
 738  *
 739  * Handle GPA faults by creating a new GPA mapping (or updating an existing
 740  * one).
 741  *
 742  * This takes care of marking pages young or dirty (idle/dirty page tracking),
 743  * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
 744  * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
 745  * caller.
 746  *
 747  * Returns:     0 on success
 748  *              -EFAULT if there is no memory region at @gpa or a write was
 749  *              attempted to a read-only memory region. This is usually handled
 750  *              as an MMIO access.
 751  */
 752 static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
 753 {
 754         bool writeable;
 755         int srcu_idx, err, retry_no = 0, level;
 756         unsigned long hva, mmu_seq, prot_bits;
 757         kvm_pfn_t pfn;
 758         kvm_pte_t *ptep, new_pte;
 759         gfn_t gfn = gpa >> PAGE_SHIFT;
 760         struct kvm *kvm = vcpu->kvm;
 761         struct kvm_memory_slot *memslot;
 762         struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 763
 764         /* Try the fast path to handle old / clean pages */
 765         srcu_idx = srcu_read_lock(&kvm->srcu);
 766         err = kvm_map_page_fast(vcpu, gpa, write);
 767         if (!err)
 768                 goto out;
 769
 770         memslot = gfn_to_memslot(kvm, gfn);
 771         hva = gfn_to_hva_memslot_prot(memslot, gfn, &writeable);
 772         if (kvm_is_error_hva(hva) || (write && !writeable)) {
 773                 err = -EFAULT;
 774                 goto out;
 775         }
 776
 777         /* We need a minimum of cached pages ready for page table creation */
 778         err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES);
 779         if (err)
 780                 goto out;
 781
 782 retry:
 783         /*
 784          * Used to check for invalidations in progress, of the pfn that is
 785          * returned by pfn_to_pfn_prot below.
 786          */
 787         mmu_seq = kvm->mmu_invalidate_seq;
 788         /*
 789          * Ensure the read of mmu_invalidate_seq isn't reordered with PTE reads in
 790          * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
 791          * risk the page we get a reference to getting unmapped before we have a
 792          * chance to grab the mmu_lock without mmu_invalidate_retry() noticing.
 793          *
 794          * This smp_rmb() pairs with the effective smp_wmb() of the combination
 795          * of the pte_unmap_unlock() after the PTE is zapped, and the
 796          * spin_lock() in kvm_mmu_invalidate_invalidate_<page|range_end>() before
 797          * mmu_invalidate_seq is incremented.
 798          */
 799         smp_rmb();
 800
 801         /* Slow path - ask KVM core whether we can access this GPA */
 802         pfn = gfn_to_pfn_prot(kvm, gfn, write, &writeable);
 803         if (is_error_noslot_pfn(pfn)) {
 804                 err = -EFAULT;
 805                 goto out;
 806         }
 807
 808         /* Check if an invalidation has taken place since we got pfn */
 809         spin_lock(&kvm->mmu_lock);
 810         if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) {
 811                 /*
 812                  * This can happen when mappings are changed asynchronously, but
 813                  * also synchronously if a COW is triggered by
 814                  * gfn_to_pfn_prot().
 815                  */
 816                 spin_unlock(&kvm->mmu_lock);
 817                 kvm_release_pfn_clean(pfn);
 818                 if (retry_no > 100) {
 819                         retry_no = 0;
 820                         schedule();
 821                 }
 822                 retry_no++;
 823                 goto retry;
 824         }
 825
 826         /*
 827          * For emulated devices such virtio device, actual cache attribute is
 828          * determined by physical machine.
 829          * For pass through physical device, it should be uncachable
 830          */
 831         prot_bits = _PAGE_PRESENT | __READABLE;
 832         if (pfn_valid(pfn))
 833                 prot_bits |= _CACHE_CC;
 834         else
 835                 prot_bits |= _CACHE_SUC;
 836
 837         if (writeable) {
 838                 prot_bits |= _PAGE_WRITE;
 839                 if (write)
 840                         prot_bits |= __WRITEABLE;
 841         }
 842
 843         /* Disable dirty logging on HugePages */
 844         level = 0;
 845         if (!fault_supports_huge_mapping(memslot, hva, PMD_SIZE, write)) {
 846                 level = 0;
 847         } else {
 848                 level = host_pfn_mapping_level(kvm, gfn, memslot);
 849                 if (level == 1) {
 850                         gfn = gfn & ~(PTRS_PER_PTE - 1);
 851                         pfn = pfn & ~(PTRS_PER_PTE - 1);
 852                 }
 853         }
 854
 855         /* Ensure page tables are allocated */
 856         ptep = kvm_populate_gpa(kvm, memcache, gpa, level);
 857         new_pte = kvm_pfn_pte(pfn, __pgprot(prot_bits));
 858         if (level == 1) {
 859                 new_pte = kvm_pte_mkhuge(new_pte);
 860                 /*
 861                  * previous pmd entry is invalid_pte_table
 862                  * there is invalid tlb with small page
 863                  * need flush these invalid tlbs for current vcpu
 864                  */
 865                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 866                 ++kvm->stat.hugepages;
 867         }  else if (kvm_pte_huge(*ptep) && write)
 868                 ptep = kvm_split_huge(vcpu, ptep, gfn);
 869         else
 870                 ++kvm->stat.pages;
 871         kvm_set_pte(ptep, new_pte);
 872         spin_unlock(&kvm->mmu_lock);
 873
 874         if (prot_bits & _PAGE_DIRTY) {
 875                 mark_page_dirty_in_slot(kvm, memslot, gfn);
 876                 kvm_set_pfn_dirty(pfn);
 877         }
 878
 879         kvm_set_pfn_accessed(pfn);
 880         kvm_release_pfn_clean(pfn);
 881 out:
 882         srcu_read_unlock(&kvm->srcu, srcu_idx);
 883         return err;
 884 }
 885
 886 int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long gpa, bool write)
 887 {
 888         int ret;
 889
 890         ret = kvm_map_page(vcpu, gpa, write);
 891         if (ret)
 892                 return ret;
 893
 894         /* Invalidate this entry in the TLB */
 895         kvm_flush_tlb_gpa(vcpu, gpa);
 896
 897         return 0;
 898 }
 899
/* Arch hook for syncing the dirty log of @memslot; nothing is done here. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}
 903
/*
 * Arch hook for preparing a memslot change. No checks or preparation are
 * performed here: every change is accepted and 0 is always returned.
 */
int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new, enum kvm_mr_change change)
{
	return 0;
}
 909
/*
 * Flush remote TLBs on behalf of @memslot. @memslot itself is not consulted:
 * the flush is done for the whole VM through kvm_flush_remote_tlbs().
 */
void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
					const struct kvm_memory_slot *memslot)
{
	kvm_flush_remote_tlbs(kvm);
}