/* GNU Linux-libre 4.4.289-gnu1: arch/x86/mm/kaiser.c */
#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>       /* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);  /* for inlined TLB flush functions */

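/*
 * Per-cpu scratch slot used by the CR3-switching entry assembly (see
 * asm/kaiser.h): on paths that have no usable kernel stack yet, the real
 * stack pointer is parked here while %rsp is borrowed as a scratch
 * register for the CR3 switch.  It lives in the user-mapped per-cpu
 * section so it is reachable while still running on the user CR3.
 */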
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Walk the kernel page tables and return the physical address that backs
 * @vaddr, or -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset_k(vaddr);
        /*
         * We made all the kernel PGDs present in kaiser_init().
         * We expect them to stay that way.
         */
        BUG_ON(pgd_none(*pgd));
        /*
         * A large (leaf) PGD entry would map 512GB or 128TB on x86_64
         * configurations.  We don't handle those.
         */
        BUG_ON(pgd_large(*pgd));

        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pud_large(*pud))
                return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        if (pmd_large(*pmd))
                return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

        pte = pte_offset_kernel(pmd, vaddr);
        if (pte_none(*pte)) {
                WARN_ON_ONCE(1);
                return -1;
        }

        return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * Walk the shadow page tables for @address like a normal page-table
 * walk, except that missing page-table pages are allocated along the
 * way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
        pmd_t *pmd;
        pud_t *pud;
        pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
        gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
        unsigned long prot = _KERNPG_TABLE;

        if (pgd_none(*pgd)) {
                WARN_ONCE(1, "All shadow pgds should have been populated");
                return NULL;
        }
        BUILD_BUG_ON(pgd_large(*pgd) != 0);

        if (user) {
                /*
                 * The vsyscall page is the only page that will have
                 * _PAGE_USER set. Catch everything else.
                 */
                BUG_ON(address != VSYSCALL_ADDR);

                set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
                prot = _PAGE_TABLE;
        }

        pud = pud_offset(pgd, address);
        /* The shadow page tables do not use large mappings: */
        if (pud_large(*pud)) {
                WARN_ON(1);
                return NULL;
        }
        if (pud_none(*pud)) {
                unsigned long new_pmd_page = __get_free_page(gfp);
                if (!new_pmd_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pud_none(*pud)) {
                        set_pud(pud, __pud(prot | __pa(new_pmd_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pmd_page), NR_KAISERTABLE);
                } else
                        free_page(new_pmd_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        pmd = pmd_offset(pud, address);
        /* The shadow page tables do not use large mappings: */
        if (pmd_large(*pmd)) {
                WARN_ON(1);
                return NULL;
        }
        if (pmd_none(*pmd)) {
                unsigned long new_pte_page = __get_free_page(gfp);
                if (!new_pte_page)
                        return NULL;
                spin_lock(&shadow_table_allocation_lock);
                if (pmd_none(*pmd)) {
                        set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
                        __inc_zone_page_state(virt_to_page((void *)
                                                new_pte_page), NR_KAISERTABLE);
                } else
                        free_page(new_pte_page);
                spin_unlock(&shadow_table_allocation_lock);
        }

        return pte_offset_kernel(pmd, address);
}

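/*
 * Map the kernel virtual range [__start_addr, __start_addr + size) into
 * the shadow page tables: each page is resolved to the physical address
 * backing its kernel mapping and a matching PTE with @flags is installed
 * in the shadow tables.
 */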
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
                               unsigned long flags)
{
        int ret = 0;
        pte_t *pte;
        unsigned long start_addr = (unsigned long)__start_addr;
        unsigned long address = start_addr & PAGE_MASK;
        unsigned long end_addr = PAGE_ALIGN(start_addr + size);
        unsigned long target_address;

        /*
         * It is convenient for callers to pass in __PAGE_KERNEL etc,
         * and there is no actual harm from setting _PAGE_GLOBAL, so
         * long as CR4.PGE is not set.  But it is nonetheless troubling
         * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
         * requires that not to be #defined to 0): so mask it off here.
         */
        flags &= ~_PAGE_GLOBAL;
        if (!(__supported_pte_mask & _PAGE_NX))
                flags &= ~_PAGE_NX;

        for (; address < end_addr; address += PAGE_SIZE) {
                target_address = get_pa_from_mapping(address);
                if (target_address == -1) {
                        ret = -EIO;
                        break;
                }
                pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
                if (!pte) {
                        ret = -ENOMEM;
                        break;
                }
                if (pte_none(*pte)) {
                        set_pte(pte, __pte(flags | target_address));
                } else {
                        pte_t tmp;
                        set_pte(&tmp, __pte(flags | target_address));
                        WARN_ON_ONCE(!pte_same(*pte, tmp));
                }
        }
        return ret;
}

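/*
 * As kaiser_add_user_map(), but taking start and end pointers
 * (e.g. linker-provided section bounds).
 */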
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
        unsigned long size = end - start;

        return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we never have to go
 * back and set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
        pgd_t *pgd;
        int i = 0;

        pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
        for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
                pgd_t new_pgd;
                pud_t *pud = pud_alloc_one(&init_mm,
                                           PAGE_OFFSET + i * PGDIR_SIZE);
                if (!pud) {
                        WARN_ON(1);
                        break;
                }
                inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
                new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
                /*
                 * Make sure not to stomp on some other pgd entry.
                 */
                if (!pgd_none(pgd[i])) {
                        WARN_ON(1);
                        continue;
                }
                set_pgd(pgd + i, new_pgd);
        }
}

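/*
 * Boot-time wrappers around kaiser_add_user_map*(): warn on failure
 * rather than returning an error to the caller.
 */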
#define kaiser_add_user_map_early(start, size, flags) do {      \
        int __ret = kaiser_add_user_map(start, size, flags);    \
        WARN_ON(__ret);                                         \
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {          \
        int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
        WARN_ON(__ret);                                                 \
} while (0)

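/*
 * Decide at boot whether KAISER should be enabled: honor "pti=on",
 * "pti=off", "pti=auto", "nopti" and "mitigations=off" on the command
 * line, disable it silently on Xen PV, and default to off on AMD CPUs,
 * which are not affected by Meltdown.
 */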
void __init kaiser_check_boottime_disable(void)
{
        bool enable = true;
        char arg[5];
        int ret;

        if (boot_cpu_has(X86_FEATURE_XENPV))
                goto silent_disable;

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret > 0) {
                if (!strncmp(arg, "on", 2))
                        goto enable;

                if (!strncmp(arg, "off", 3))
                        goto disable;

                if (!strncmp(arg, "auto", 4))
                        goto skip;
        }

        if (cmdline_find_option_bool(boot_command_line, "nopti") ||
            cpu_mitigations_off())
                goto disable;

skip:
        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
                goto disable;

enable:
        if (enable)
                setup_force_cpu_cap(X86_FEATURE_KAISER);

        return;

disable:
        pr_info("disabled\n");

silent_disable:
        kaiser_enabled = 0;
        setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
        int cpu;

        if (!kaiser_enabled)
                return;

        kaiser_init_all_pgds();

        /*
         * Note that this sets _PAGE_USER and it needs to happen when the
         * pagetable hierarchy gets created, i.e., early. Otherwise
         * kaiser_pagetable_walk() will encounter already-initialized PTEs
         * in the hierarchy and not set the proper permissions, leading to
         * page faults with page-protection violations when, for example,
         * trying to read the vsyscall page.
         */
        if (vsyscall_enabled())
                kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
                                          PAGE_SIZE,
                                          vsyscall_pgprot);

        for_each_possible_cpu(cpu) {
                void *percpu_vaddr = __per_cpu_user_mapped_start +
                                     per_cpu_offset(cpu);
                unsigned long percpu_sz = __per_cpu_user_mapped_end -
                                          __per_cpu_user_mapped_start;
                kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
                                          __PAGE_KERNEL);
        }

        /*
         * Map the entry/exit text section, which is needed for
         * switches from user space to the kernel and back.
         */
        kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
                                       __PAGE_KERNEL_RX);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        kaiser_add_user_map_ptrs_early(__irqentry_text_start,
                                       __irqentry_text_end,
                                       __PAGE_KERNEL_RX);
#endif
        kaiser_add_user_map_early((void *)idt_descr.address,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
        kaiser_add_user_map_early(&trace_idt_descr,
                                  sizeof(trace_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&trace_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);
#endif
        kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
                                  __PAGE_KERNEL);
        kaiser_add_user_map_early(&debug_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);

        pr_info("enabled\n");
}

/* Add a mapping to the shadow page tables, mirroring the kernel mapping of the range */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
        if (!kaiser_enabled)
                return 0;
        return kaiser_add_user_map((const void *)addr, size, flags);
}

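/*
 * Clear a range from the shadow page tables.  The lower-level page-table
 * pages themselves are deliberately not freed (the "_nofree" unmap
 * variant); only their entries are cleared.
 */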
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
        extern void unmap_pud_range_nofree(pgd_t *pgd,
                                unsigned long start, unsigned long end);
        unsigned long end = start + size;
        unsigned long addr, next;
        pgd_t *pgd;

        if (!kaiser_enabled)
                return;
        pgd = native_get_shadow_pgd(pgd_offset_k(start));
        for (addr = start; addr < end; pgd++, addr = next) {
                next = pgd_addr_end(addr, end);
                unmap_pud_range_nofree(pgd, addr, next);
        }
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
        return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

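/*
 * Called when a kernel PGD entry is written: mirror userspace entries
 * into the shadow PGD and return the value to store in the kernel PGD,
 * possibly with _PAGE_NX added so userspace cannot run on the kernel
 * copy of the page tables.
 */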
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
        if (!kaiser_enabled)
                return pgd;
        /*
         * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
         * skip cases like kexec and EFI which make temporary low mappings.
         */
        if (pgd.pgd & _PAGE_USER) {
                if (is_userspace_pgd(pgdp)) {
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                        /*
                         * Even if the entry is *mapping* userspace, ensure
                         * that userspace can not use it.  This way, if we
                         * get out to userspace running on the kernel CR3,
                         * userspace will crash instead of running.
                         */
                        if (__supported_pte_mask & _PAGE_NX)
                                pgd.pgd |= _PAGE_NX;
                }
        } else if (!pgd.pgd) {
                /*
                 * pgd_clear() cannot check _PAGE_USER, and is even used to
                 * clear corrupted pgd entries: so just rely on cases like
                 * kexec and EFI never to be using pgd_clear().
                 */
                if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
                    is_userspace_pgd(pgdp))
                        native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
        }
        return pgd;
}

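/*
 * Pre-compute this cpu's user CR3 value: the shadow PGD offset plus,
 * when PCID is available, the user PCID with the NOFLUSH bit set.
 */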
void kaiser_setup_pcid(void)
{
        unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

        if (this_cpu_has(X86_FEATURE_PCID))
                user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
        /*
         * This variable is used by the entry/exit code to change
         * the PCID and pgd, and to decide on TLB flushing.
         */
        this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this CPU will need to flush the USER TLB on return to
 * user space.  If the CPU does not have PCID, then the NOFLUSH bit will
 * never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
        if (this_cpu_has(X86_FEATURE_PCID))
                this_cpu_write(x86_cr3_pcid_user,
                        X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);