/* GNU Linux-libre 4.4.289-gnu1: arch/x86/mm/kaiser.c */
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>       /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);  /* for inlined TLB flush functions */

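/*
 * Per-cpu scratch slot used by the CR3-switching entry assembly (see
 * asm/kaiser.h): on paths that have no usable kernel stack yet, the real
 * stack pointer is parked here while %rsp is borrowed as a scratch
 * register for the CR3 switch.  It lives in the user-mapped per-cpu
 * section so it is reachable while still running on the user CR3.
 */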
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Walk the kernel page tables and return the physical address that backs
 * @vaddr, or -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset_k(vaddr);
        /*
         * We made all the kernel PGDs present in kaiser_init().
         * We expect them to stay that way.
         */
        BUG_ON(pgd_none(*pgd));
        /*
         * A large (leaf) PGD entry would map 512GB or 128TB on x86_64
         * configurations.  We don't handle those.
         */
        BUG_ON(pgd_large(*pgd));

        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pud_large(*pud))
                return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pmd_large(*pmd))
                return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

        pte = pte_offset_kernel(pmd, vaddr);
        if (pte_none(*pte)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * Walk the shadow page tables for @address like a normal page-table
 * walk, except that missing page-table pages are allocated along the
 * way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
        unsigned long prot = _KERNPG_TABLE;

        if (pgd_none(*pgd)) {
                WARN_ONCE(1, "All shadow pgds should have been populated");
                return NULL;
        }
        BUILD_BUG_ON(pgd_large(*pgd) != 0);

        if (user) {
                /*
                 * The vsyscall page is the only page that will have
                 * _PAGE_USER set. Catch everything else.
                 */
                BUG_ON(address != VSYSCALL_ADDR);

                set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
                prot = _PAGE_TABLE;
        }

        pud = pud_offset(pgd, address);
        /* The shadow page tables do not use large mappings: */
        if (pud_large(*pud)) {
                WARN_ON(1);
                return NULL;
        }
        if (pud_none(*pud)) {
                unsigned long new_pmd_page = __get_free_page(gfp);
                if (!new_pmd_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pud_none(*pud)) {
                        set_pud(pud, __pud(prot | __pa(new_pmd_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pmd_page), NR_KAISERTABLE);
                } else
                        free_page(new_pmd_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        pmd = pmd_offset(pud, address);
        /* The shadow page tables do not use large mappings: */
        if (pmd_large(*pmd)) {
                WARN_ON(1);
                return NULL;
        }
        if (pmd_none(*pmd)) {
                unsigned long new_pte_page = __get_free_page(gfp);
                if (!new_pte_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pmd_none(*pmd)) {
                        set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pte_page), NR_KAISERTABLE);
                } else
                        free_page(new_pte_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        return pte_offset_kernel(pmd, address);
}

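/*
 * Map the kernel virtual range [__start_addr, __start_addr + size) into
 * the shadow page tables: each page is resolved to the physical address
 * backing its kernel mapping and a matching PTE with @flags is installed
 * in the shadow tables.
 */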
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
                               unsigned long flags)
{
        int ret = 0;
        pte_t *pte;
        unsigned long start_addr = (unsigned long)__start_addr;
        unsigned long address = start_addr & PAGE_MASK;
        unsigned long end_addr = PAGE_ALIGN(start_addr + size);
        unsigned long target_address;

        /*
         * It is convenient for callers to pass in __PAGE_KERNEL etc,
         * and there is no actual harm from setting _PAGE_GLOBAL, so
         * long as CR4.PGE is not set.  But it is nonetheless troubling
         * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
         * requires that not to be #defined to 0): so mask it off here.
         */
        flags &= ~_PAGE_GLOBAL;
        if (!(__supported_pte_mask & _PAGE_NX))
                flags &= ~_PAGE_NX;

        for (; address < end_addr; address += PAGE_SIZE) {
                target_address = get_pa_from_mapping(address);
                if (target_address == -1) {
                        ret = -EIO;
                        break;
                }
                pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
                if (!pte) {
                        ret = -ENOMEM;
                        break;
                }
                if (pte_none(*pte)) {
                        set_pte(pte, __pte(flags | target_address));
                } else {
                        pte_t tmp;
                        set_pte(&tmp, __pte(flags | target_address));
                        WARN_ON_ONCE(!pte_same(*pte, tmp));
                }
        }
        return ret;
}

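/*
 * As kaiser_add_user_map(), but taking start and end pointers
 * (e.g. linker-provided section bounds).
 */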
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
        unsigned long size = end - start;

        return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we never have to go
 * back and set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
        pgd_t *pgd;
        int i = 0;

        pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
        for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
                pgd_t new_pgd;
                pud_t *pud = pud_alloc_one(&init_mm,
                                           PAGE_OFFSET + i * PGDIR_SIZE);
                if (!pud) {
                        WARN_ON(1);
                        break;
                }
                inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
                new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
                /*
                 * Make sure not to stomp on some other pgd entry.
                 */
                if (!pgd_none(pgd[i])) {
                        WARN_ON(1);
                        continue;
                }
                set_pgd(pgd + i, new_pgd);
        }
}

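/*
 * Boot-time wrappers around kaiser_add_user_map*(): warn on failure
 * rather than returning an error to the caller.
 */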
#define kaiser_add_user_map_early(start, size, flags) do {      \
        int __ret = kaiser_add_user_map(start, size, flags);    \
        WARN_ON(__ret);                                         \
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {          \
        int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
        WARN_ON(__ret);                                                 \
} while (0)

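/*
 * Decide at boot whether KAISER should be enabled: honor "pti=on",
 * "pti=off", "pti=auto", "nopti" and "mitigations=off" on the command
 * line, disable it silently on Xen PV, and default to off on AMD CPUs,
 * which are not affected by Meltdown.
 */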
void __init kaiser_check_boottime_disable(void)
{
        bool enable = true;
        char arg[5];
        int ret;

        if (boot_cpu_has(X86_FEATURE_XENPV))
                goto silent_disable;

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret > 0) {
                if (!strncmp(arg, "on", 2))
                        goto enable;

                if (!strncmp(arg, "off", 3))
                        goto disable;

                if (!strncmp(arg, "auto", 4))
                        goto skip;
        }

        if (cmdline_find_option_bool(boot_command_line, "nopti") ||
            cpu_mitigations_off())
                goto disable;

skip:
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                goto disable;

enable:
        if (enable)
                setup_force_cpu_cap(X86_FEATURE_KAISER);

        return;

disable:
        pr_info("disabled\n");

silent_disable:
        kaiser_enabled = 0;
        setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
        int cpu;

        if (!kaiser_enabled)
                return;

        kaiser_init_all_pgds();

        /*
         * Note that this sets _PAGE_USER and it needs to happen when the
         * pagetable hierarchy gets created, i.e., early. Otherwise
         * kaiser_pagetable_walk() will encounter already-initialized PTEs
         * in the hierarchy and not set the proper permissions, leading to
         * page faults with page-protection violations when, for example,
         * trying to read the vsyscall page.
         */
        if (vsyscall_enabled())
                kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
                                          PAGE_SIZE,
                                          vsyscall_pgprot);

        for_each_possible_cpu(cpu) {
                void *percpu_vaddr = __per_cpu_user_mapped_start +
                                     per_cpu_offset(cpu);
                unsigned long percpu_sz = __per_cpu_user_mapped_end -
                                          __per_cpu_user_mapped_start;
                kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
                                          __PAGE_KERNEL);
        }

        /*
         * Map the entry/exit text section, which is needed for
         * switches from user space to the kernel and back.
         */
        kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
                                       __PAGE_KERNEL_RX);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        kaiser_add_user_map_ptrs_early(__irqentry_text_start,
                                       __irqentry_text_end,
                                       __PAGE_KERNEL_RX);
#endif
        kaiser_add_user_map_early((void *)idt_descr.address,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
        kaiser_add_user_map_early(&trace_idt_descr,
                                  sizeof(trace_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&trace_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);
#endif
        kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&debug_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);

        pr_info("enabled\n");
}

/* Add a mapping to the shadow page tables, mirroring the kernel mapping of the range */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
        if (!kaiser_enabled)
                return 0;
        return kaiser_add_user_map((const void *)addr, size, flags);
}

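/*
 * Clear a range from the shadow page tables.  The lower-level page-table
 * pages themselves are deliberately not freed (the "_nofree" unmap
 * variant); only their entries are cleared.
 */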
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
        extern void unmap_pud_range_nofree(pgd_t *pgd,
                                unsigned long start, unsigned long end);
        unsigned long end = start + size;
        unsigned long addr, next;
        pgd_t *pgd;

        if (!kaiser_enabled)
                return;
        pgd = native_get_shadow_pgd(pgd_offset_k(start));
        for (addr = start; addr < end; pgd++, addr = next) {
                next = pgd_addr_end(addr, end);
                unmap_pud_range_nofree(pgd, addr, next);
        }
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
        return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

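/*
 * Called when a kernel PGD entry is written: mirror userspace entries
 * into the shadow PGD and return the value to store in the kernel PGD,
 * possibly with _PAGE_NX added so userspace cannot run on the kernel
 * copy of the page tables.
 */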
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
        if (!kaiser_enabled)
                return pgd;
        /*
         * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
         * skip cases like kexec and EFI which make temporary low mappings.
         */
        if (pgd.pgd & _PAGE_USER) {
                if (is_userspace_pgd(pgdp)) {
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                        /*
                         * Even if the entry is *mapping* userspace, ensure
                         * that userspace can not use it.  This way, if we
                         * get out to userspace running on the kernel CR3,
                         * userspace will crash instead of running.
                         */
                        if (__supported_pte_mask & _PAGE_NX)
                                pgd.pgd |= _PAGE_NX;
                }
        } else if (!pgd.pgd) {
                /*
                 * pgd_clear() cannot check _PAGE_USER, and is even used to
                 * clear corrupted pgd entries: so just rely on cases like
                 * kexec and EFI never to be using pgd_clear().
                 */
                if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
                    is_userspace_pgd(pgdp))
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
        }
        return pgd;
}

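/*
 * Pre-compute this cpu's user CR3 value: the shadow PGD offset plus,
 * when PCID is available, the user PCID with the NOFLUSH bit set.
 */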
void kaiser_setup_pcid(void)
{
        unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

        if (this_cpu_has(X86_FEATURE_PCID))
                user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
        /*
         * This variable is used by the entry/exit code to change
         * the PCID and pgd, and to decide on TLB flushing.
         */
        this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this CPU will need to flush the USER TLB on return to
 * user space.  If the CPU does not have PCID, then the NOFLUSH bit will
 * never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
        if (this_cpu_has(X86_FEATURE_PCID))
                this_cpu_write(x86_cr3_pcid_user,
                        X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);