summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel/mce.c
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2019-04-29 20:16:02 +0200
committerMichael Ellerman <mpe@ellerman.id.au>2019-05-01 14:23:20 +0200
commit50dbabe06a6e1c35980154ea1fac2ed6ad28652b (patch)
treebb698e0688b3c4205117c843815269eb4e305a97 /arch/powerpc/kernel/mce.c
parentpowerpc/powernv/mce: Print correct severity for MCE error. (diff)
downloadlinux-50dbabe06a6e1c35980154ea1fac2ed6ad28652b.tar.xz
linux-50dbabe06a6e1c35980154ea1fac2ed6ad28652b.zip
powerpc/powernv/mce: Print additional information about MCE error.
Print more information about MCE error whether it is an hardware or software error. Some of the MCE errors can be easily categorized as hardware or software errors e.g. UEs are due to hardware error, where as error triggered due to invalid usage of tlbie is a pure software bug. But not all the MCE errors can be easily categorize into either software or hardware. There are errors like multihit errors which are usually result of a software bug, but in some rare cases a hardware failure can cause a multihit error. In past, we have seen case where after replacing faulty chip, multihit errors stopped occurring. Same with parity errors, which are usually due to faulty hardware but there are chances where multihit can also cause an parity error. Such errors are difficult to determine what really caused it. Hence this patch classifies MCE errors into following four categorize: 1. Hardware error: UE and Link timeout failure errors. 2. Probable hardware error (some chance of software cause) SLB/ERAT/TLB Parity errors. 3. Software error Invalid tlbie form. 4. Probable software error (some chance of hardware cause) SLB/ERAT/TLB Multihit errors. Sample output: MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered] MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60] MCE: CPU80: Probable Software error (some chance of hardware cause) Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch/powerpc/kernel/mce.c')
-rw-r--r--arch/powerpc/kernel/mce.c12
1 files changed, 12 insertions, 0 deletions
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 71d245a387ab..4581377cfc98 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -123,6 +123,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->initiator = mce_err->initiator;
mce->severity = mce_err->severity;
mce->sync_error = mce_err->sync_error;
+ mce->error_class = mce_err->error_class;
/*
* Populate the mce error_type and type-specific error_type.
@@ -363,6 +364,13 @@ void machine_check_print_event_info(struct machine_check_event *evt,
"Store (timeout)",
"Page table walk Load/Store (timeout)",
};
+ static const char *mc_error_class[] = {
+ "Unknown",
+ "Hardware error",
+ "Probable Hardware error (some chance of software cause)",
+ "Software error",
+ "Probable Software error (some chance of hardware cause)",
+ };
/* Print things out */
if (evt->version != MCE_V1) {
@@ -487,6 +495,10 @@ void machine_check_print_event_info(struct machine_check_event *evt,
printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
}
+
+ subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
+ mc_error_class[evt->error_class] : "Unknown";
+ printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
}
EXPORT_SYMBOL_GPL(machine_check_print_event_info);