/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>

#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
struct pg_state {
	int level;				/* page-table level of the current series (1=pgd .. 5=pte) */
	pgprot_t current_prot;			/* protections shared by the current series */
	unsigned long start_address;		/* first address of the current series */
	unsigned long current_address;		/* address currently being examined */
	const struct addr_marker *marker;	/* address-space marker for the region being walked */
	unsigned long lines;			/* lines printed for the current marker region */
	bool to_dmesg;				/* print via printk() instead of the seq_file */
	bool check_wx;				/* audit for W+X mappings while walking */
	unsigned long wx_pages;			/* number of W+X pages found so far */
};

struct addr_marker {
	unsigned long start_address;	/* first address of the region */
	const char *name;		/* label printed as ---[ name ]--- */
	unsigned long max_lines;	/* cap on lines printed for the region; 0 = no limit */
};

/* Address space marker hints */

#ifdef CONFIG_X86_64

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
#ifdef CONFIG_KASAN
	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
#endif
	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#else /* CONFIG_X86_64 */

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
#ifdef CONFIG_HIGHMEM
	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
#endif
	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#endif /* !CONFIG_X86_64 */

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
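
/*
 * Worked example (illustrative, assuming 4 KiB pages): stepping an index
 * by one at a given level advances the virtual address by
 *
 *	PTE_LEVEL_MULT = 4 KiB			(one page)
 *	PMD_LEVEL_MULT = 512 * 4 KiB  = 2 MiB
 *	PUD_LEVEL_MULT = 512 * 2 MiB  = 1 GiB
 *	P4D_LEVEL_MULT = 512 * 1 GiB  = 512 GiB
 *	PGD_LEVEL_MULT = 512 GiB on 4-level paging (PTRS_PER_P4D == 1),
 *			 256 TiB on 5-level paging (PTRS_PER_P4D == 512)
 */
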
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

	if (!(pr & _PAGE_PRESENT)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, "                              ");
	} else {
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");

		/* Bit 7 has a different meaning on level 3 vs 4 */
		if (level <= 4 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 5 && pr & _PAGE_PAT) ||
		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x  ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
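
/*
 * For illustration (this example is not in the original source): a typical
 * global, writable, non-executable kernel data page decodes above as
 *
 *	USR=blank  RW  PWT=blank  PCD=blank  PSE=blank  PAT=blank  GLB  NX  pte
 *
 * i.e. the line reads "RW ... GLB NX pte", with unset flags rendered as
 * blank columns so the flag columns stay aligned across lines.
 */
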
/*
 * On 64 bits, sign-extend the 48-bit address to 64 bits
 */
static unsigned long normalize_addr(unsigned long u)
{
	int shift;
	if (!IS_ENABLED(CONFIG_X86_64))
		return u;

	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	return (signed long)(u << shift) >> shift;
}
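
/*
 * Worked example (illustrative): with 4-level paging __VIRTUAL_MASK_SHIFT
 * is 47, so shift = 64 - 48 = 16. The index-derived address of PGD slot
 * 256, 0x0000800000000000, becomes
 *
 *	(signed long)(0x0000800000000000 << 16) >> 16 = 0xffff800000000000
 *
 * i.e. the upper half of the table maps to the canonical kernel half of
 * the address space.
 */
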
/*
 * This function gets called on a break in a continuous series
 * of PTE entries; the next one is different so we need to
 * print what we collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, int level)
{
	pgprotval_t prot, cur;
	static const char units[] = "BKMGTPE";

	/*
	 * If we have a "break" in the series, we need to flush the state
	 * that we have now. A "break" is either a change in perms or level,
	 * or crossing an address space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);

	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;
		pgprotval_t pr = pgprot_val(st->current_prot);

		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx   ",
					   width, st->start_address,
					   width, st->current_address);

			delta = st->current_address - st->start_address;
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of address space,
		 * such as the start of vmalloc space etc.
		 * This helps in the interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ...\n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->level = level;
	}
}
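
/*
 * For illustration (not from the original source), a flushed series plus
 * the following marker ends up in the dump looking roughly like:
 *
 *	0xffffffff81000000-0xffffffff81a00000    10M   ro   PSE   GLB x  pmd
 *	---[ Modules ]---
 *
 * The "10M" size column comes from the units[] reduction above: delta is
 * shifted right by 10 while it remains a whole multiple of 1024.
 */
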
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
	int i;
	pte_t *start;
	pgprotval_t prot;

	start = (pte_t *)pmd_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		prot = pte_flags(*start);
		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		note_page(m, st, __pgprot(prot), 5);
		start++;
	}
}

#ifdef CONFIG_KASAN

/*
 * This is an optimization for the KASAN=y case. Since all KASAN page
 * tables eventually point to the kasan_zero_page, we can call note_page()
 * right away without walking through the lower-level page tables. This
 * saves us dozens of seconds (minutes for a 5-level config) when checking
 * for W+X mappings or reading the kernel_page_tables debugfs file.
 */
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				void *pt)
{
	if (__pa(pt) == __pa(kasan_zero_pmd) ||
#ifdef CONFIG_X86_5LEVEL
	    __pa(pt) == __pa(kasan_zero_p4d) ||
#endif
	    __pa(pt) == __pa(kasan_zero_pud)) {
		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
		note_page(m, st, __pgprot(prot), 5);
		return true;
	}
	return false;
}
#else
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				void *pt)
{
	return false;
}
#endif
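
/*
 * Rough numbers for the savings (illustrative, assuming 4-level paging):
 * the KASAN shadow spans 16 TiB, i.e. about 4 billion ptes. Recognizing
 * the shared zero tables here collapses that walk into a single
 * note_page() call.
 */
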
#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
	int i;
	pmd_t *start, *pmd_start;
	pgprotval_t prot;

	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*start)) {
			if (pmd_large(*start) || !pmd_present(*start)) {
				prot = pmd_flags(*start);
				note_page(m, st, __pgprot(prot), 4);
			} else if (!kasan_page_table(m, st, pmd_start)) {
				walk_pte_level(m, st, *start,
					       P + i * PMD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 4);
		start++;
	}
}

#else
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
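
/*
 * Note: on configs with a folded PMD (PTRS_PER_PMD == 1, e.g. 2-level
 * 32-bit non-PAE), the stubs above reinterpret the pud entry as a pmd:
 * __pmd(pud_val(a)) carries the same bits, so the walk simply skips the
 * folded level.
 */
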
#if PTRS_PER_PUD > 1

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
	int i;
	pud_t *start, *pud_start;
	pgprotval_t prot;
	pud_t *prev_pud = NULL;

	pud_start = start = (pud_t *)p4d_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*start)) {
			if (pud_large(*start) || !pud_present(*start)) {
				prot = pud_flags(*start);
				note_page(m, st, __pgprot(prot), 3);
			} else if (!kasan_page_table(m, st, pud_start)) {
				walk_pmd_level(m, st, *start,
					       P + i * PUD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 3);

		prev_pud = start;
		start++;
	}
}

#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

#if PTRS_PER_P4D > 1

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
	int i;
	p4d_t *start, *p4d_start;
	pgprotval_t prot;

	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_P4D; i++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*start)) {
			if (p4d_large(*start) || !p4d_present(*start)) {
				prot = p4d_flags(*start);
				note_page(m, st, __pgprot(prot), 2);
			} else if (!kasan_page_table(m, st, p4d_start)) {
				walk_pud_level(m, st, *start,
					       P + i * P4D_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 2);

		start++;
	}
}

#else
#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
#define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
#endif

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * A hole at the beginning of the kernel address space, reserved
	 * for a hypervisor.
	 */
	return	(idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
		(idx <  pgd_index(GUARD_HOLE_END_ADDR));
#else
	return false;
#endif
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx, bool dmesg)
{
#ifdef CONFIG_X86_64
	pgd_t *start = (pgd_t *) &init_top_pgt;
#else
	pgd_t *start = swapper_pg_dir;
#endif
	pgprotval_t prot;
	int i;
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = dmesg;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			if (pgd_large(*start) || !pgd_present(*start)) {
				prot = pgd_flags(*start);
				note_page(m, &st, __pgprot(prot), 1);
			} else {
				walk_p4d_level(m, &st, *start,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 1);

		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD * PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false, true);
}

void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (user && static_cpu_has(X86_FEATURE_PTI))
		pgd = kernel_to_user_pgdp(pgd);
#endif
	ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
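
/*
 * A minimal consumer of the hook above, sketched after
 * arch/x86/mm/debug_pagetables.c (the wiring shown here is illustrative,
 * not part of this file):
 *
 *	static int ptdump_show(struct seq_file *m, void *v)
 *	{
 *		ptdump_walk_pgd_level_debugfs(m, NULL, false);
 *		return 0;
 *	}
 *
 * hooked up via single_open()/debugfs so that reading the debugfs file
 * (e.g. /sys/kernel/debug/page_tables/kernel) dumps the kernel tables.
 */
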
static void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *pgd = (pgd_t *) &init_top_pgt;

	if (!static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}

void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
	ptdump_walk_user_pgd_level_checkwx();
}
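
/*
 * Note: with CONFIG_DEBUG_WX=y, the debug_checkwx() macro invoked at the
 * end of mark_rodata_ro() expands to this function, so both the kernel
 * page tables and (with PTI) their user-space copy get a one-time W+X
 * audit at boot.
 */
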
static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
#endif
	return 0;
}
__initcall(pt_dump_init);