GNU Linux-libre 5.4.274-gnu1
[releases.git] / arch / powerpc / platforms / powernv / opal-hmi.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4  *
5  * Copyright 2014 IBM Corporation
6  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7  */
8
9 #undef DEBUG
10
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/of.h>
14 #include <linux/mm.h>
15 #include <linux/slab.h>
16
17 #include <asm/opal.h>
18 #include <asm/cputable.h>
19 #include <asm/machdep.h>
20
21 #include "powernv.h"
22
/* Non-zero once opal_hmi_handler_init() has registered the HMI notifier. */
static int opal_hmi_handler_nb_init;
24 struct OpalHmiEvtNode {
25         struct list_head list;
26         struct OpalHMIEvent hmi_evt;
27 };
28
/*
 * One row of a checkstop decode table: a reason bit mask together with
 * the strings printed when that mask matches the event's reason word.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* bit mask tested against the reason word */
	const char *unit_failed;	/* text printed in the "[Unit: ...]" field */
	const char *description;	/* text printed after the unit field */
};
34
/*
 * HMI events waiting to be logged by the work handler. The list is filled
 * by the OPAL message notifier and drained by hmi_event_handler(); every
 * access to it is made under opal_hmi_evt_lock.
 */
static LIST_HEAD(opal_hmi_evt_list);
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37
38 static void print_core_checkstop_reason(const char *level,
39                                         struct OpalHMIEvent *hmi_evt)
40 {
41         int i;
42         static const struct xstop_reason xstop_reason[] = {
43                 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44                                 "RegFile core check stop" },
45                 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46                 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47                                 "Core checkstop during recovery" },
48                 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49                                 "RegFile core check stop (mapper error)" },
50                 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51                 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52                 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53                 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54                                 "Recovery in maintenance mode" },
55                 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56                                 "RegFile core check stop" },
57                 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58                                 "Forward Progress Error" },
59                 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60                 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61                 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62                                 "Hypervisor Resource error - core check stop" },
63                 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64                                 "Hang Recovery Failed (core check stop)" },
65                 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66                                 "Ambiguous Hang Detected (unknown source)" },
67                 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68                                 "Debug Trigger Error inject" },
69                 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70                                 "Hypervisor check stop via SPRC/SPRD" },
71         };
72
73         /* Validity check */
74         if (!hmi_evt->u.xstop_error.xstop_reason) {
75                 printk("%s      Unknown Core check stop.\n", level);
76                 return;
77         }
78
79         printk("%s      CPU PIR: %08x\n", level,
80                         be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83                                         xstop_reason[i].xstop_reason)
84                         printk("%s      [Unit: %-3s] %s\n", level,
85                                         xstop_reason[i].unit_failed,
86                                         xstop_reason[i].description);
87 }
88
89 static void print_nx_checkstop_reason(const char *level,
90                                         struct OpalHMIEvent *hmi_evt)
91 {
92         int i;
93         static const struct xstop_reason xstop_reason[] = {
94                 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95                                         "SHM invalid state error" },
96                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97                                         "DMA invalid state error bit 15" },
98                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99                                         "DMA invalid state error bit 16" },
100                 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101                                         "Channel 0 invalid state error" },
102                 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103                                         "Channel 1 invalid state error" },
104                 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105                                         "Channel 2 invalid state error" },
106                 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107                                         "Channel 3 invalid state error" },
108                 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109                                         "Channel 4 invalid state error" },
110                 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111                                         "Channel 5 invalid state error" },
112                 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113                                         "Channel 6 invalid state error" },
114                 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115                                         "Channel 7 invalid state error" },
116                 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117                                         "UE error on CRB(CSB address, CCB)" },
118                 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119                                         "SUE error on CRB(CSB address, CCB)" },
120                 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121                 "CRB Kill ISN received while holding ISN with UE error" },
122         };
123
124         /* Validity check */
125         if (!hmi_evt->u.xstop_error.xstop_reason) {
126                 printk("%s      Unknown NX check stop.\n", level);
127                 return;
128         }
129
130         printk("%s      NX checkstop on CHIP ID: %x\n", level,
131                         be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134                                         xstop_reason[i].xstop_reason)
135                         printk("%s      [Unit: %-3s] %s\n", level,
136                                         xstop_reason[i].unit_failed,
137                                         xstop_reason[i].description);
138 }
139
140 static void print_npu_checkstop_reason(const char *level,
141                                         struct OpalHMIEvent *hmi_evt)
142 {
143         uint8_t reason, reason_count, i;
144
145         /*
146          * We may not have a checkstop reason on some combination of
147          * hardware and/or skiboot version
148          */
149         if (!hmi_evt->u.xstop_error.xstop_reason) {
150                 printk("%s      NPU checkstop on chip %x\n", level,
151                         be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
152                 return;
153         }
154
155         /*
156          * NPU2 has 3 FIRs. Reason encoded on a byte as:
157          *   2 bits for the FIR number
158          *   6 bits for the bit number
159          * It may be possible to find several reasons.
160          *
161          * We don't display a specific message per FIR bit as there
162          * are too many and most are meaningless without the workbook
163          * and/or hw team help anyway.
164          */
165         reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
166                 sizeof(reason);
167         for (i = 0; i < reason_count; i++) {
168                 reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
169                 if (reason)
170                         printk("%s      NPU checkstop on chip %x: FIR%d bit %d is set\n",
171                                 level,
172                                 be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
173                                 reason >> 6, reason & 0x3F);
174         }
175 }
176
177 static void print_checkstop_reason(const char *level,
178                                         struct OpalHMIEvent *hmi_evt)
179 {
180         uint8_t type = hmi_evt->u.xstop_error.xstop_type;
181         switch (type) {
182         case CHECKSTOP_TYPE_CORE:
183                 print_core_checkstop_reason(level, hmi_evt);
184                 break;
185         case CHECKSTOP_TYPE_NX:
186                 print_nx_checkstop_reason(level, hmi_evt);
187                 break;
188         case CHECKSTOP_TYPE_NPU:
189                 print_npu_checkstop_reason(level, hmi_evt);
190                 break;
191         default:
192                 printk("%s      Unknown Malfunction Alert of type %d\n",
193                        level, type);
194                 break;
195         }
196 }
197
198 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
199 {
200         const char *level, *sevstr, *error_info;
201         static const char *hmi_error_types[] = {
202                 "Malfunction Alert",
203                 "Processor Recovery done",
204                 "Processor recovery occurred again",
205                 "Processor recovery occurred for masked error",
206                 "Timer facility experienced an error",
207                 "TFMR SPR is corrupted",
208                 "UPS (Uninterrupted Power System) Overflow indication",
209                 "An XSCOM operation failure",
210                 "An XSCOM operation completed",
211                 "SCOM has set a reserved FIR bit to cause recovery",
212                 "Debug trigger has set a reserved FIR bit to cause recovery",
213                 "A hypervisor resource error occurred",
214                 "CAPP recovery process is in progress",
215         };
216
217         /* Print things out */
218         if (hmi_evt->version < OpalHMIEvt_V1) {
219                 pr_err("HMI Interrupt, Unknown event version %d !\n",
220                         hmi_evt->version);
221                 return;
222         }
223         switch (hmi_evt->severity) {
224         case OpalHMI_SEV_NO_ERROR:
225                 level = KERN_INFO;
226                 sevstr = "Harmless";
227                 break;
228         case OpalHMI_SEV_WARNING:
229                 level = KERN_WARNING;
230                 sevstr = "";
231                 break;
232         case OpalHMI_SEV_ERROR_SYNC:
233                 level = KERN_ERR;
234                 sevstr = "Severe";
235                 break;
236         case OpalHMI_SEV_FATAL:
237         default:
238                 level = KERN_ERR;
239                 sevstr = "Fatal";
240                 break;
241         }
242
243         printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
244                 level, sevstr,
245                 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
246                 "Recovered" : "Not recovered");
247         error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
248                         hmi_error_types[hmi_evt->type]
249                         : "Unknown";
250         printk("%s Error detail: %s\n", level, error_info);
251         printk("%s      HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
252         if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
253                 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
254                 printk("%s      TFMR: %016llx\n", level,
255                                                 be64_to_cpu(hmi_evt->tfmr));
256
257         if (hmi_evt->version < OpalHMIEvt_V2)
258                 return;
259
260         /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
261         if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
262                 print_checkstop_reason(level, hmi_evt);
263 }
264
265 static void hmi_event_handler(struct work_struct *work)
266 {
267         unsigned long flags;
268         struct OpalHMIEvent *hmi_evt;
269         struct OpalHmiEvtNode *msg_node;
270         uint8_t disposition;
271         struct opal_msg msg;
272         int unrecoverable = 0;
273
274         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
275         while (!list_empty(&opal_hmi_evt_list)) {
276                 msg_node = list_entry(opal_hmi_evt_list.next,
277                                            struct OpalHmiEvtNode, list);
278                 list_del(&msg_node->list);
279                 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
280
281                 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
282                 print_hmi_event_info(hmi_evt);
283                 disposition = hmi_evt->disposition;
284                 kfree(msg_node);
285
286                 /*
287                  * Check if HMI event has been recovered or not. If not
288                  * then kernel can't continue, we need to panic.
289                  * But before we do that, display all the HMI event
290                  * available on the list and set unrecoverable flag to 1.
291                  */
292                 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
293                         unrecoverable = 1;
294
295                 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
296         }
297         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
298
299         if (unrecoverable) {
300                 /* Pull all HMI events from OPAL before we panic. */
301                 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
302                         u32 type;
303
304                         type = be32_to_cpu(msg.msg_type);
305
306                         /* skip if not HMI event */
307                         if (type != OPAL_MSG_HMI_EVT)
308                                 continue;
309
310                         /* HMI event info starts from param[0] */
311                         hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
312                         print_hmi_event_info(hmi_evt);
313                 }
314
315                 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
316         }
317 }
318
319 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
320 /*
321  * opal_handle_hmi_event - notifier handler that queues up HMI events
322  * to be preocessed later.
323  */
324 static int opal_handle_hmi_event(struct notifier_block *nb,
325                           unsigned long msg_type, void *msg)
326 {
327         unsigned long flags;
328         struct OpalHMIEvent *hmi_evt;
329         struct opal_msg *hmi_msg = msg;
330         struct OpalHmiEvtNode *msg_node;
331
332         /* Sanity Checks */
333         if (msg_type != OPAL_MSG_HMI_EVT)
334                 return 0;
335
336         /* HMI event info starts from param[0] */
337         hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
338
339         /* Delay the logging of HMI events to workqueue. */
340         msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
341         if (!msg_node) {
342                 pr_err("HMI: out of memory, Opal message event not handled\n");
343                 return -ENOMEM;
344         }
345         memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
346
347         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
348         list_add(&msg_node->list, &opal_hmi_evt_list);
349         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
350
351         schedule_work(&hmi_event_work);
352         return 0;
353 }
354
355 static struct notifier_block opal_hmi_handler_nb = {
356         .notifier_call  = opal_handle_hmi_event,
357         .next           = NULL,
358         .priority       = 0,
359 };
360
361 int __init opal_hmi_handler_init(void)
362 {
363         int ret;
364
365         if (!opal_hmi_handler_nb_init) {
366                 ret = opal_message_notifier_register(
367                                 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
368                 if (ret) {
369                         pr_err("%s: Can't register OPAL event notifier (%d)\n",
370                                __func__, ret);
371                         return ret;
372                 }
373                 opal_hmi_handler_nb_init = 1;
374         }
375         return 0;
376 }