GNU Linux-libre 4.14.324-gnu1
[releases.git] / arch / powerpc / platforms / powernv / opal-hmi.c
1 /*
2  * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Copyright 2014 IBM Corporation
18  * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19  */
20
21 #undef DEBUG
22
23 #include <linux/kernel.h>
24 #include <linux/init.h>
25 #include <linux/of.h>
26 #include <linux/mm.h>
27 #include <linux/slab.h>
28
29 #include <asm/opal.h>
30 #include <asm/cputable.h>
31 #include <asm/machdep.h>
32
33 #include "powernv.h"
34
35 static int opal_hmi_handler_nb_init;
36 struct OpalHmiEvtNode {
37         struct list_head list;
38         struct OpalHMIEvent hmi_evt;
39 };
40
41 struct xstop_reason {
42         uint32_t xstop_reason;
43         const char *unit_failed;
44         const char *description;
45 };
46
47 static LIST_HEAD(opal_hmi_evt_list);
48 static DEFINE_SPINLOCK(opal_hmi_evt_lock);
49
50 static void print_core_checkstop_reason(const char *level,
51                                         struct OpalHMIEvent *hmi_evt)
52 {
53         int i;
54         static const struct xstop_reason xstop_reason[] = {
55                 { CORE_CHECKSTOP_IFU_REGFILE, "IFU",
56                                 "RegFile core check stop" },
57                 { CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
58                 { CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
59                                 "Core checkstop during recovery" },
60                 { CORE_CHECKSTOP_ISU_REGFILE, "ISU",
61                                 "RegFile core check stop (mapper error)" },
62                 { CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
63                 { CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
64                 { CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
65                 { CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
66                                 "Recovery in maintenance mode" },
67                 { CORE_CHECKSTOP_LSU_REGFILE, "LSU",
68                                 "RegFile core check stop" },
69                 { CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
70                                 "Forward Progress Error" },
71                 { CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
72                 { CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
73                 { CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
74                                 "Hypervisor Resource error - core check stop" },
75                 { CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
76                                 "Hang Recovery Failed (core check stop)" },
77                 { CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
78                                 "Ambiguous Hang Detected (unknown source)" },
79                 { CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
80                                 "Debug Trigger Error inject" },
81                 { CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
82                                 "Hypervisor check stop via SPRC/SPRD" },
83         };
84
85         /* Validity check */
86         if (!hmi_evt->u.xstop_error.xstop_reason) {
87                 printk("%s      Unknown Core check stop.\n", level);
88                 return;
89         }
90
91         printk("%s      CPU PIR: %08x\n", level,
92                         be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
93         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
94                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
95                                         xstop_reason[i].xstop_reason)
96                         printk("%s      [Unit: %-3s] %s\n", level,
97                                         xstop_reason[i].unit_failed,
98                                         xstop_reason[i].description);
99 }
100
101 static void print_nx_checkstop_reason(const char *level,
102                                         struct OpalHMIEvent *hmi_evt)
103 {
104         int i;
105         static const struct xstop_reason xstop_reason[] = {
106                 { NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
107                                         "SHM invalid state error" },
108                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
109                                         "DMA invalid state error bit 15" },
110                 { NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
111                                         "DMA invalid state error bit 16" },
112                 { NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
113                                         "Channel 0 invalid state error" },
114                 { NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
115                                         "Channel 1 invalid state error" },
116                 { NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
117                                         "Channel 2 invalid state error" },
118                 { NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
119                                         "Channel 3 invalid state error" },
120                 { NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
121                                         "Channel 4 invalid state error" },
122                 { NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
123                                         "Channel 5 invalid state error" },
124                 { NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
125                                         "Channel 6 invalid state error" },
126                 { NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
127                                         "Channel 7 invalid state error" },
128                 { NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
129                                         "UE error on CRB(CSB address, CCB)" },
130                 { NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
131                                         "SUE error on CRB(CSB address, CCB)" },
132                 { NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
133                 "CRB Kill ISN received while holding ISN with UE error" },
134         };
135
136         /* Validity check */
137         if (!hmi_evt->u.xstop_error.xstop_reason) {
138                 printk("%s      Unknown NX check stop.\n", level);
139                 return;
140         }
141
142         printk("%s      NX checkstop on CHIP ID: %x\n", level,
143                         be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
144         for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
145                 if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
146                                         xstop_reason[i].xstop_reason)
147                         printk("%s      [Unit: %-3s] %s\n", level,
148                                         xstop_reason[i].unit_failed,
149                                         xstop_reason[i].description);
150 }
151
152 static void print_checkstop_reason(const char *level,
153                                         struct OpalHMIEvent *hmi_evt)
154 {
155         uint8_t type = hmi_evt->u.xstop_error.xstop_type;
156         switch (type) {
157         case CHECKSTOP_TYPE_CORE:
158                 print_core_checkstop_reason(level, hmi_evt);
159                 break;
160         case CHECKSTOP_TYPE_NX:
161                 print_nx_checkstop_reason(level, hmi_evt);
162                 break;
163         default:
164                 printk("%s      Unknown Malfunction Alert of type %d\n",
165                        level, type);
166                 break;
167         }
168 }
169
170 static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
171 {
172         const char *level, *sevstr, *error_info;
173         static const char *hmi_error_types[] = {
174                 "Malfunction Alert",
175                 "Processor Recovery done",
176                 "Processor recovery occurred again",
177                 "Processor recovery occurred for masked error",
178                 "Timer facility experienced an error",
179                 "TFMR SPR is corrupted",
180                 "UPS (Uniterrupted Power System) Overflow indication",
181                 "An XSCOM operation failure",
182                 "An XSCOM operation completed",
183                 "SCOM has set a reserved FIR bit to cause recovery",
184                 "Debug trigger has set a reserved FIR bit to cause recovery",
185                 "A hypervisor resource error occurred",
186                 "CAPP recovery process is in progress",
187         };
188
189         /* Print things out */
190         if (hmi_evt->version < OpalHMIEvt_V1) {
191                 pr_err("HMI Interrupt, Unknown event version %d !\n",
192                         hmi_evt->version);
193                 return;
194         }
195         switch (hmi_evt->severity) {
196         case OpalHMI_SEV_NO_ERROR:
197                 level = KERN_INFO;
198                 sevstr = "Harmless";
199                 break;
200         case OpalHMI_SEV_WARNING:
201                 level = KERN_WARNING;
202                 sevstr = "";
203                 break;
204         case OpalHMI_SEV_ERROR_SYNC:
205                 level = KERN_ERR;
206                 sevstr = "Severe";
207                 break;
208         case OpalHMI_SEV_FATAL:
209         default:
210                 level = KERN_ERR;
211                 sevstr = "Fatal";
212                 break;
213         }
214
215         printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
216                 level, sevstr,
217                 hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
218                 "Recovered" : "Not recovered");
219         error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
220                         hmi_error_types[hmi_evt->type]
221                         : "Unknown";
222         printk("%s Error detail: %s\n", level, error_info);
223         printk("%s      HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
224         if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
225                 (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
226                 printk("%s      TFMR: %016llx\n", level,
227                                                 be64_to_cpu(hmi_evt->tfmr));
228
229         if (hmi_evt->version < OpalHMIEvt_V2)
230                 return;
231
232         /* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
233         if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
234                 print_checkstop_reason(level, hmi_evt);
235 }
236
237 static void hmi_event_handler(struct work_struct *work)
238 {
239         unsigned long flags;
240         struct OpalHMIEvent *hmi_evt;
241         struct OpalHmiEvtNode *msg_node;
242         uint8_t disposition;
243         struct opal_msg msg;
244         int unrecoverable = 0;
245
246         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
247         while (!list_empty(&opal_hmi_evt_list)) {
248                 msg_node = list_entry(opal_hmi_evt_list.next,
249                                            struct OpalHmiEvtNode, list);
250                 list_del(&msg_node->list);
251                 spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
252
253                 hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
254                 print_hmi_event_info(hmi_evt);
255                 disposition = hmi_evt->disposition;
256                 kfree(msg_node);
257
258                 /*
259                  * Check if HMI event has been recovered or not. If not
260                  * then kernel can't continue, we need to panic.
261                  * But before we do that, display all the HMI event
262                  * available on the list and set unrecoverable flag to 1.
263                  */
264                 if (disposition != OpalHMI_DISPOSITION_RECOVERED)
265                         unrecoverable = 1;
266
267                 spin_lock_irqsave(&opal_hmi_evt_lock, flags);
268         }
269         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
270
271         if (unrecoverable) {
272                 /* Pull all HMI events from OPAL before we panic. */
273                 while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
274                         u32 type;
275
276                         type = be32_to_cpu(msg.msg_type);
277
278                         /* skip if not HMI event */
279                         if (type != OPAL_MSG_HMI_EVT)
280                                 continue;
281
282                         /* HMI event info starts from param[0] */
283                         hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
284                         print_hmi_event_info(hmi_evt);
285                 }
286
287                 pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
288         }
289 }
290
291 static DECLARE_WORK(hmi_event_work, hmi_event_handler);
292 /*
293  * opal_handle_hmi_event - notifier handler that queues up HMI events
294  * to be preocessed later.
295  */
296 static int opal_handle_hmi_event(struct notifier_block *nb,
297                           unsigned long msg_type, void *msg)
298 {
299         unsigned long flags;
300         struct OpalHMIEvent *hmi_evt;
301         struct opal_msg *hmi_msg = msg;
302         struct OpalHmiEvtNode *msg_node;
303
304         /* Sanity Checks */
305         if (msg_type != OPAL_MSG_HMI_EVT)
306                 return 0;
307
308         /* HMI event info starts from param[0] */
309         hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
310
311         /* Delay the logging of HMI events to workqueue. */
312         msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
313         if (!msg_node) {
314                 pr_err("HMI: out of memory, Opal message event not handled\n");
315                 return -ENOMEM;
316         }
317         memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
318
319         spin_lock_irqsave(&opal_hmi_evt_lock, flags);
320         list_add(&msg_node->list, &opal_hmi_evt_list);
321         spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
322
323         schedule_work(&hmi_event_work);
324         return 0;
325 }
326
327 static struct notifier_block opal_hmi_handler_nb = {
328         .notifier_call  = opal_handle_hmi_event,
329         .next           = NULL,
330         .priority       = 0,
331 };
332
333 int __init opal_hmi_handler_init(void)
334 {
335         int ret;
336
337         if (!opal_hmi_handler_nb_init) {
338                 ret = opal_message_notifier_register(
339                                 OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
340                 if (ret) {
341                         pr_err("%s: Can't register OPAL event notifier (%d)\n",
342                                __func__, ret);
343                         return ret;
344                 }
345                 opal_hmi_handler_nb_init = 1;
346         }
347         return 0;
348 }