1 // SPDX-License-Identifier: GPL-2.0-only
3 * Interface for exporting the OPAL ELF core.
4 * Heavily inspired from fs/proc/vmcore.c
6 * Copyright 2019, Hari Bathini, IBM Corporation.
9 #define pr_fmt(fmt) "opal core: " fmt
11 #include <linux/memblock.h>
12 #include <linux/uaccess.h>
13 #include <linux/proc_fs.h>
14 #include <linux/elf.h>
15 #include <linux/elfcore.h>
16 #include <linux/kobject.h>
17 #include <linux/sysfs.h>
18 #include <linux/slab.h>
19 #include <linux/crash_core.h>
24 #include <asm/fadump-internal.h>
26 #include "opal-fadump.h"
/* Maximum number of OPAL memory regions kept for export as PT_LOAD segments. */
28 #define MAX_PT_LOAD_CNT 8
30 /* NT_AUXV note related info */
/*
 * Size in bytes of the NT_AUXV note descriptor: (2 * AUXV_CNT) + 1 entries,
 * each sizeof(Elf64_Off). AUXV_CNT is defined outside the lines shown here.
 */
32 #define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
/*
 * Bookkeeping for the exported OPAL core file: where the firmware placed the
 * CPU state data, which OPAL memory regions are exported as PT_LOAD segments,
 * the buffer size for the ELF headers and PT_NOTE, and scratch space for the
 * NT_AUXV descriptor.
 *
 * NOTE(review): other code in this file also uses the members num_cpus,
 * crashing_cpu, ptload_cnt, opalcore_size and opalcorebuf, whose declarations
 * are not visible in this listing. The trailing list_head member is used as
 * the link of opalcore_list entries (struct opalcore), so it presumably
 * belongs to a second struct whose header line is missing here - confirm
 * against the full file.
 */
34 struct opalcore_config {
36 /* PIR value of crashing CPU */
39 /* CPU state data info from F/W */
40 u64 cpu_state_destination_vaddr;
41 u64 cpu_state_data_size;
42 u64 cpu_state_entry_size;
44 /* OPAL memory to be exported as PT_LOAD segments */
45 u64 ptload_addr[MAX_PT_LOAD_CNT];
46 u64 ptload_size[MAX_PT_LOAD_CNT];
49 /* Pointer to the first PT_LOAD in the ELF core file */
50 Elf64_Phdr *ptload_phdr;
52 /* Total size of opalcore file. */
55 /* Buffer for all the ELF core headers and the PT_NOTE */
56 size_t opalcorebuf_sz;
60 char auxv_buf[AUXV_DESC_SZ];
64 struct list_head list;
/*
 * File-scope state:
 *  - opalcore_list: one entry per exported OPAL memory chunk, filled in
 *    create_opalcore() and walked by read_opalcore().
 *  - oc_conf: core configuration, allocated in opalcore_config_init().
 *  - opalc_metadata / opalc_cpu_metadata: OPAL and CPU metadata, set from the
 *    addresses returned by opal_mpipl_query_tag().
 *  - mpipl_kobj: sysfs kobject created in opalcore_init().
 */
70 static LIST_HEAD(opalcore_list);
71 static struct opalcore_config *oc_conf;
72 static const struct opal_mpipl_fadump *opalc_metadata;
73 static const struct opal_mpipl_fadump *opalc_cpu_metadata;
74 static struct kobject *mpipl_kobj;
77 * Set crashing CPU's signal to SIGUSR1 if the crash is initiated
78 * by the kernel, SIGTERM otherwise.
/* Read by fill_prstatus() to pick SIGUSR1 (true) or SIGTERM (false). */
80 bool kernel_initiated;
82 static struct opalcore * __init get_new_element(void)
84 return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
87 static inline int is_opalcore_usable(void)
89 return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
/*
 * Write one ELF64 note at buf: a note header, then the name, then the
 * descriptor data.
 *
 * The header fields n_namesz (strlen(name) + 1), n_descsz (data_len) and
 * n_type (type) are stored through cpu_to_be32(). The name and the data are
 * each copied with memcpy(), and after every part buf is advanced by that
 * part's size rounded up to a whole number of Elf64_Word units.
 *
 * NOTE(review): the tail of the parameter list (type, data, data_len) and the
 * return statement are not visible in this listing. Callers assign the result
 * back to their buffer pointer, so this presumably returns the advanced buf -
 * confirm against the full file.
 */
92 static Elf64_Word *__init append_elf64_note(Elf64_Word *buf, char *name,
96 Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
97 Elf64_Word namesz = strlen(name) + 1;
99 note->n_namesz = cpu_to_be32(namesz);
100 note->n_descsz = cpu_to_be32(data_len);
101 note->n_type = cpu_to_be32(type);
102 buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
103 memcpy(buf, name, namesz);
104 buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
105 memcpy(buf, data, data_len);
106 buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
/*
 * Build the elf_prstatus for one CPU thread.
 *
 * prstatus: output, zeroed first and then filled in.
 * pir:      PIR of the thread; stored (plus 100) as the big-endian pr_pid.
 * regs:     register set copied into pr_reg by elf_core_copy_regs().
 *
 * pr_ppid is set to big-endian 1. When pir equals oc_conf->crashing_cpu,
 * pr_cursig is set to SIGUSR1 if kernel_initiated is true and to SIGTERM
 * otherwise, stored through cpu_to_be16().
 *
 * NOTE(review): the declaration of sig and the closing lines of the function
 * are not visible in this listing.
 */
111 static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir,
112 struct pt_regs *regs)
114 memset(prstatus, 0, sizeof(struct elf_prstatus));
115 elf_core_copy_regs(&(prstatus->pr_reg), regs);
118 * Overload PID with PIR value.
119 * As a PIR value could also be '0', add an offset of '100'
120 * to every PIR to avoid misinterpretations in GDB.
122 prstatus->common.pr_pid = cpu_to_be32(100 + pir);
123 prstatus->common.pr_ppid = cpu_to_be32(1);
126 * Indicate SIGUSR1 for crash initiated from kernel.
129 if (pir == oc_conf->crashing_cpu) {
132 sig = kernel_initiated ? SIGUSR1 : SIGTERM;
133 prstatus->common.pr_cursig = cpu_to_be16(sig);
/*
 * Build the auxiliary vector in oc_conf->auxv_buf and append it at buf as an
 * NT_AUXV note named CRASH_CORE_NOTE_NAME.
 *
 * The vector is zeroed over AUXV_DESC_SZ bytes, then filled with big-endian
 * 64-bit values: the pair (AT_ENTRY, opal_boot_entry) followed by AT_NULL.
 *
 * NOTE(review): the second parameter (presumably opal_boot_entry, as passed
 * by create_opalcore()), the declaration of idx and the return statement are
 * not visible in this listing - confirm against the full file.
 */
137 static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf,
140 Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
143 memset(bufp, 0, AUXV_DESC_SZ);
145 /* Entry point of OPAL */
146 bufp[idx++] = cpu_to_be64(AT_ENTRY);
147 bufp[idx++] = cpu_to_be64(opal_boot_entry);
150 bufp[idx++] = cpu_to_be64(AT_NULL);
152 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_AUXV,
153 oc_conf->auxv_buf, AUXV_DESC_SZ);
158 * Read from the ELF header and then the crash dump.
159 * Returns number of bytes read on success, -errno on failure.
/*
 * sysfs binary-attribute read handler for the OPAL core file.
 *
 * to:    destination buffer.
 * pos:   offset within the core file; compared against
 *        oc_conf->opalcore_size before anything is copied.
 * count: number of bytes requested.
 *
 * Data is served from two sources. Offsets below oc_conf->opalcorebuf_sz are
 * copied out of oc_conf->opalcorebuf (ELF headers and PT_NOTE). Beyond that,
 * opalcore_list is walked and, for an entry whose [offset, offset + size)
 * range lies past the current position, bytes are copied from
 * __va(paddr + position - offset). Each copy is limited with min_t() to what
 * is left in the source and to count.
 *
 * NOTE(review): the local declarations, the action taken when pos is past the
 * end, the clamping of count to avail, the updates of the position / count /
 * destination after each copy, and the return statement are not visible in
 * this listing.
 */
161 static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
162 struct bin_attribute *bin_attr, char *to,
163 loff_t pos, size_t count)
169 if (pos >= oc_conf->opalcore_size)
172 /* Adjust count if it goes beyond opalcore size */
173 avail = oc_conf->opalcore_size - pos;
180 /* Read ELF core header and/or PT_NOTE segment */
181 if (tpos < oc_conf->opalcorebuf_sz) {
182 tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
183 memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
189 list_for_each_entry(m, &opalcore_list, list) {
190 /* nothing more to read here */
194 if (tpos < m->offset + m->size) {
197 tsz = min_t(size_t, m->offset + m->size - tpos, count);
198 addr = (void *)(m->paddr + tpos - m->offset);
199 memcpy(to, __va(addr), tsz);
/*
 * Binary sysfs attribute for the OPAL core: named core, mode 0400, read
 * through read_opalcore(). Its size field is filled in by opalcore_init().
 */
209 static struct bin_attribute opal_core_attr = {
210 .attr = {.name = "core", .mode = 0400},
211 .read = read_opalcore
215 * Read CPU state dump data and convert it into ELF notes.
217 * Each register entry is 16 bytes: a numerical identifier along with
218 * a GPR/SPR flag in the first 8 bytes and the register value in the next
219 * 8 bytes. For more details refer to F/W documentation.
/*
 * Convert the firmware CPU state data into NT_PRSTATUS notes written at buf.
 *
 * The register-entry offset, entry size and entry count are read once from
 * the first thread header. One NT_PRSTATUS slot is reserved at the start
 * (first_cpu_note). Then, for each of oc_conf->num_cpus thread entries whose
 * core_state is not HDAT_FADUMP_CORE_INACTIVE, the registers are read with
 * opal_fadump_read_regs() and turned into a prstatus by fill_prstatus().
 * The thread whose PIR equals oc_conf->crashing_cpu is written into the
 * reserved first slot; every other thread is appended after it.
 *
 * NOTE(review): the declarations of regs, bufp and i, the statement that
 * skips inactive cores, the sizeof arguments closing the two
 * append_elf64_note() calls in the loop, and the return statement are not
 * visible in this listing.
 * NOTE(review): bufp applies __va() to cpu_state_destination_vaddr, which
 * opalcore_config_init() already stores as a __va() result - confirm the
 * double translation is intended.
 */
221 static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
223 u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
224 struct hdat_fadump_thread_hdr *thdr;
225 struct elf_prstatus prstatus;
226 Elf64_Word *first_cpu_note;
231 size_per_thread = oc_conf->cpu_state_entry_size;
232 bufp = __va(oc_conf->cpu_state_destination_vaddr);
235 * Offset for register entries, entry size and registers count is
236 * duplicated in every thread header in keeping with HDAT format.
237 * Use these values from the first thread header.
239 thdr = (struct hdat_fadump_thread_hdr *)bufp;
240 regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
241 be32_to_cpu(thdr->offset));
242 reg_esize = be32_to_cpu(thdr->esize);
243 regs_cnt = be32_to_cpu(thdr->ecnt);
245 pr_debug("--------CPU State Data------------\n");
246 pr_debug("NumCpus : %u\n", oc_conf->num_cpus);
247 pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
248 regs_offset, reg_esize, regs_cnt);
251 * Skip past the first CPU note. Fill this note with the
252 * crashing CPU's prstatus.
254 first_cpu_note = buf;
255 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
256 &prstatus, sizeof(prstatus));
258 for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
259 thdr = (struct hdat_fadump_thread_hdr *)bufp;
260 thread_pir = be32_to_cpu(thdr->pir);
262 pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
263 i, thread_pir, thdr->core_state);
266 * Register state data of MAX cores is provided by firmware,
267 * but some of these cores may not be active. So, while
268 * processing register state data, check core state and
269 * skip threads that belong to inactive cores.
271 if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
274 opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
275 reg_esize, false, &regs);
277 pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
278 be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
279 fill_prstatus(&prstatus, thread_pir, &regs);
281 if (thread_pir != oc_conf->crashing_cpu) {
282 buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME,
283 NT_PRSTATUS, &prstatus,
287 * Add crashing CPU as the first NT_PRSTATUS note for
288 * GDB to process the core file appropriately.
290 append_elf64_note(first_cpu_note, CRASH_CORE_NOTE_NAME,
291 NT_PRSTATUS, &prstatus,
/*
 * Build the in-memory ELF core image for OPAL.
 *
 * Visible steps:
 *  1. Size the headers (one Elf64_Ehdr plus ptload_cnt + 1 program headers)
 *     and the notes (one prstatus-sized note per CPU plus one NT_AUXV note).
 *  2. Allocate a zeroed, page-aligned buffer with alloc_pages_exact() and
 *     mark each of its pages reserved. On allocation failure an error is
 *     logged and opalcorebuf_sz is reset to 0.
 *  3. Read opal-base-address and opal-boot-address from the ibm,opal
 *     device-tree node; a warning is logged when that fails.
 *  4. Fill the ELF header (big-endian, ET_CORE) and a PT_NOTE program header
 *     whose file offset is hdr_size and whose size is cpu_notes_size.
 *  5. For every configured OPAL region, emit a PT_LOAD program header and
 *     queue an element on opalcore_list recording the region address, size
 *     and file offset. File offsets start at opalcorebuf_sz and grow by each
 *     region size.
 *  6. Store the program header count in e_phnum, append the CPU notes and the
 *     NT_AUXV note, and record the total file size in oc_conf->opalcore_size.
 *
 * NOTE(review): several local declarations, the initialisation of count and
 * paddr, the handling of a failed get_new_element(), the error paths and the
 * return statements are not visible in this listing.
 */
299 static int __init create_opalcore(void)
301 u64 opal_boot_entry, opal_base_addr, paddr;
302 u32 hdr_size, cpu_notes_size, count;
303 struct device_node *dn;
304 struct opalcore *new;
312 /* Get size of header & CPU notes for OPAL core */
313 hdr_size = (sizeof(Elf64_Ehdr) +
314 ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
315 cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
316 CRASH_CORE_NOTE_NAME_BYTES +
317 CRASH_CORE_NOTE_DESC_BYTES)) +
318 (CRASH_CORE_NOTE_HEAD_BYTES +
319 CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
321 /* Allocate buffer to setup OPAL core */
322 oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
323 oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
324 GFP_KERNEL | __GFP_ZERO);
325 if (!oc_conf->opalcorebuf) {
326 pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
327 oc_conf->opalcorebuf_sz);
328 oc_conf->opalcorebuf_sz = 0;
/* Reserve every page of the freshly allocated buffer. */
331 count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
332 page = virt_to_page(oc_conf->opalcorebuf);
333 for (i = 0; i < count; i++)
334 mark_page_reserved(page + i);
336 pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
338 /* Read OPAL related device-tree entries */
339 dn = of_find_node_by_name(NULL, "ibm,opal");
341 ret = of_property_read_u64(dn, "opal-base-address",
343 pr_debug("opal-base-address: %llx\n", opal_base_addr);
344 ret |= of_property_read_u64(dn, "opal-boot-address",
346 pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
349 pr_warn("WARNING: Failed to read OPAL base & entry values\n");
353 /* Use count to keep track of the program headers */
/* ELF header sits at the very start of the buffer. */
356 bufp = oc_conf->opalcorebuf;
357 elf = (Elf64_Ehdr *)bufp;
358 bufp += sizeof(Elf64_Ehdr);
359 memcpy(elf->e_ident, ELFMAG, SELFMAG);
360 elf->e_ident[EI_CLASS] = ELF_CLASS;
361 elf->e_ident[EI_DATA] = ELFDATA2MSB;
362 elf->e_ident[EI_VERSION] = EV_CURRENT;
363 elf->e_ident[EI_OSABI] = ELF_OSABI;
364 memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
365 elf->e_type = cpu_to_be16(ET_CORE);
366 elf->e_machine = cpu_to_be16(ELF_ARCH);
367 elf->e_version = cpu_to_be32(EV_CURRENT);
369 elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
373 elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
374 elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
376 elf->e_shentsize = 0;
/* First program header: PT_NOTE, located right after all the headers. */
380 phdr = (Elf64_Phdr *)bufp;
381 bufp += sizeof(Elf64_Phdr);
382 phdr->p_type = cpu_to_be32(PT_NOTE);
385 phdr->p_paddr = phdr->p_vaddr = 0;
386 phdr->p_offset = cpu_to_be64(hdr_size);
387 phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size);
/* Memory contents start in the file right after the header/notes buffer. */
390 opalcore_off = oc_conf->opalcorebuf_sz;
391 oc_conf->ptload_phdr = (Elf64_Phdr *)bufp;
/* One PT_LOAD header and one opalcore_list element per OPAL region. */
393 for (i = 0; i < oc_conf->ptload_cnt; i++) {
394 phdr = (Elf64_Phdr *)bufp;
395 bufp += sizeof(Elf64_Phdr);
396 phdr->p_type = cpu_to_be32(PT_LOAD);
397 phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X);
400 new = get_new_element();
403 new->paddr = oc_conf->ptload_addr[i];
404 new->size = oc_conf->ptload_size[i];
405 new->offset = opalcore_off;
406 list_add_tail(&new->list, &opalcore_list);
408 phdr->p_paddr = cpu_to_be64(paddr);
409 phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr);
410 phdr->p_filesz = phdr->p_memsz =
411 cpu_to_be64(oc_conf->ptload_size[i]);
412 phdr->p_offset = cpu_to_be64(opalcore_off);
415 opalcore_off += oc_conf->ptload_size[i];
416 paddr += oc_conf->ptload_size[i];
419 elf->e_phnum = cpu_to_be16(count);
421 bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
422 bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
424 oc_conf->opalcore_size = opalcore_off;
/*
 * Undo the OPAL core export: remove the core binary file from mpipl_kobj,
 * clear the PT_LOAD bookkeeping in oc_conf, and hand the header/notes buffer
 * back with free_reserved_area() before clearing the buffer pointer and size.
 * Registered as an exit call below.
 *
 * NOTE(review): lines between the visible statements are missing from this
 * listing (for example any guard on oc_conf being NULL and any release of
 * oc_conf or of the opalcore_list elements) - confirm against the full file.
 */
428 static void opalcore_cleanup(void)
433 /* Remove OPAL core sysfs file */
434 sysfs_remove_bin_file(mpipl_kobj, &opal_core_attr);
435 oc_conf->ptload_phdr = NULL;
436 oc_conf->ptload_cnt = 0;
438 /* free the buffer used for setting up OPAL core */
439 if (oc_conf->opalcorebuf) {
440 void *end = (void *)((u64)oc_conf->opalcorebuf +
441 oc_conf->opalcorebuf_sz);
443 free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
444 oc_conf->opalcorebuf = NULL;
445 oc_conf->opalcorebuf_sz = 0;
451 __exitcall(opalcore_cleanup);
/*
 * Gather everything needed to export the OPAL core and store it in oc_conf.
 *
 * Visible steps:
 *  1. Look up the /ibm,opal/dump device-tree node, warn when it is not
 *     compatible with ibm,opal-dump, and read its mpipl-boot property.
 *  2. Query firmware for the OPAL and CPU metadata addresses
 *     (OPAL_MPIPL_TAG_OPAL, OPAL_MPIPL_TAG_CPU); each big-endian address is
 *     converted and mapped with __va(). A failed query logs an error.
 *  3. Allocate oc_conf with kzalloc().
 *  4. Warn when the OPAL metadata version differs from OPAL_MPIPL_VERSION,
 *     then copy at most MAX_PT_LOAD_CNT region destination/size pairs into
 *     ptload_addr / ptload_size and record the crashing PIR. An empty region
 *     list is reported as an error.
 *  5. Warn when the CPU data version differs from HDAT_FADUMP_CPU_DATA_VER,
 *     then record the CPU state destination (stored as a __va() address),
 *     the total data size and the per-thread entry size. An entry size of 0
 *     or one larger than the data size is reported as invalid.
 *  6. Derive num_cpus as data size divided by entry size.
 *
 * NOTE(review): the local declarations, the early returns / cleanup taken on
 * each failure, and the call whose failure produces the final error message
 * are not visible in this listing.
 */
453 static void __init opalcore_config_init(void)
455 u32 idx, cpu_data_version;
456 struct device_node *np;
461 np = of_find_node_by_path("/ibm,opal/dump");
465 if (!of_device_is_compatible(np, "ibm,opal-dump")) {
466 pr_warn("Support missing for this f/w version!\n");
470 /* Check if dump has been initiated on last reboot */
471 prop = of_get_property(np, "mpipl-boot", NULL);
477 /* Get OPAL metadata */
478 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
479 if ((ret != OPAL_SUCCESS) || !addr) {
480 pr_err("Failed to get OPAL metadata (%d)\n", ret);
484 addr = be64_to_cpu(addr);
485 pr_debug("OPAL metadata addr: %llx\n", addr);
486 opalc_metadata = __va(addr);
488 /* Get OPAL CPU metadata */
489 ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
490 if ((ret != OPAL_SUCCESS) || !addr) {
491 pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
495 addr = be64_to_cpu(addr);
496 pr_debug("CPU metadata addr: %llx\n", addr);
497 opalc_cpu_metadata = __va(addr);
499 /* Allocate memory for config buffer */
500 oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
504 /* Parse OPAL metadata */
505 if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
506 pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
507 OPAL_MPIPL_VERSION, opalc_metadata->version);
508 pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
511 oc_conf->ptload_cnt = 0;
512 idx = be32_to_cpu(opalc_metadata->region_cnt);
513 if (idx > MAX_PT_LOAD_CNT) {
514 pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
515 idx, MAX_PT_LOAD_CNT);
516 idx = MAX_PT_LOAD_CNT;
518 for (i = 0; i < idx; i++) {
519 oc_conf->ptload_addr[oc_conf->ptload_cnt] =
520 be64_to_cpu(opalc_metadata->region[i].dest);
521 oc_conf->ptload_size[oc_conf->ptload_cnt++] =
522 be64_to_cpu(opalc_metadata->region[i].size);
524 oc_conf->ptload_cnt = i;
525 oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
527 if (!oc_conf->ptload_cnt) {
528 pr_err("OPAL memory regions not found\n");
532 /* Parse OPAL CPU metadata */
533 cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
534 if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
535 pr_warn("Supported CPU data version: %u, found: %u!\n",
536 HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
537 pr_warn("WARNING: F/W using newer CPU state data format!!\n");
540 addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
542 pr_err("CPU state data not found!\n");
545 oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
547 oc_conf->cpu_state_data_size =
548 be64_to_cpu(opalc_cpu_metadata->region[0].size);
549 oc_conf->cpu_state_entry_size =
550 be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
552 if ((oc_conf->cpu_state_entry_size == 0) ||
553 (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
554 pr_err("CPU state data is invalid.\n");
557 oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
558 oc_conf->cpu_state_entry_size);
564 pr_err("Could not export /sys/firmware/opal/core\n");
/*
 * sysfs store handler behind the write-only release_core attribute.
 *
 * buf is parsed as an integer with kstrtoint() (base auto-detected). When
 * oc_conf is NULL an error is logged because the core file is not available.
 *
 * NOTE(review): the declaration of input, the values returned on the parse
 * and NULL-config failures, the check applied to the parsed value, the
 * cleanup call announced by the trailing comment, and the final return are
 * not visible in this listing.
 */
569 static ssize_t release_core_store(struct kobject *kobj,
570 struct kobj_attribute *attr,
571 const char *buf, size_t count)
575 if (kstrtoint(buf, 0, &input))
579 if (oc_conf == NULL) {
580 pr_err("'/sys/firmware/opal/core' file not accessible!\n");
585 * Take away '/sys/firmware/opal/core' and release all memory
586 * used for exporting this file.
/*
 * sysfs wiring for the mpipl directory: a write-only release_core attribute
 * (handled by release_core_store()), the attribute and binary-attribute
 * tables, and the attribute group that opalcore_init() registers with
 * sysfs_create_group().
 *
 * NOTE(review): the remaining table entries, the terminators and the attrs
 * member of the group are not visible in this listing.
 */
595 static struct kobj_attribute opalcore_rel_attr = __ATTR_WO(release_core);
597 static struct attribute *mpipl_attr[] = {
598 &opalcore_rel_attr.attr,
602 static struct bin_attribute *mpipl_bin_attr[] = {
608 static const struct attribute_group mpipl_group = {
610 .bin_attrs = mpipl_bin_attr,
/*
 * Initcall (registered with fs_initcall below) that exports the OPAL core
 * through sysfs.
 *
 * Visible steps:
 *  1. Run opalcore_config_init() to populate oc_conf.
 *  2. Bail out with an error message when is_opalcore_usable() reports that
 *     no core buffer is available.
 *  3. Publish the core size in opal_core_attr.size.
 *  4. Create the mpipl kobject under opal_kobj and register mpipl_group on
 *     it; either failure is logged.
 *  5. Create a compatibility symlink from opal_kobj to the entry under
 *     mpipl_kobj; failure is logged.
 *
 * NOTE(review): the declaration of rc, the step between configuration and the
 * usability check (presumably the call that builds the core image), the
 * cleanup and return values on each failure, the remaining arguments of the
 * symlink call and the final return are not visible in this listing.
 */
613 static int __init opalcore_init(void)
617 opalcore_config_init();
625 * If oc_conf->opalcorebuf= is set in the 2nd kernel,
626 * then capture the dump.
628 if (!(is_opalcore_usable())) {
629 pr_err("Failed to export /sys/firmware/opal/mpipl/core\n");
634 /* Set OPAL core file size */
635 opal_core_attr.size = oc_conf->opalcore_size;
637 mpipl_kobj = kobject_create_and_add("mpipl", opal_kobj);
639 pr_err("unable to create mpipl kobject\n");
643 /* Export OPAL core sysfs file */
644 rc = sysfs_create_group(mpipl_kobj, &mpipl_group);
646 pr_err("mpipl sysfs group creation failed (%d)", rc);
650 /* The /sys/firmware/opal/core is moved to /sys/firmware/opal/mpipl/
651 * directory, need to create symlink at old location to maintain
652 * backward compatibility.
654 rc = compat_only_sysfs_link_entry_to_kobj(opal_kobj, mpipl_kobj,
657 pr_err("unable to create core symlink (%d)\n", rc);
663 fs_initcall(opalcore_init);