Commit 50dbabe0 authored by Mahesh Salgaonkar's avatar Mahesh Salgaonkar Committed by Michael Ellerman

powerpc/powernv/mce: Print additional information about MCE error.

Print more information about MCE error whether it is an hardware or
software error.

Some of the MCE errors can be easily categorized as hardware or
software errors e.g. UEs are due to hardware error, where as error
triggered due to invalid usage of tlbie is a pure software bug. But
not all the MCE errors can be easily categorize into either software
or hardware. There are errors like multihit errors which are usually
result of a software bug, but in some rare cases a hardware failure
can cause a multihit error. In past, we have seen case where after
replacing faulty chip, multihit errors stopped occurring. Same with
parity errors, which are usually due to faulty hardware but there are
chances where multihit can also cause an parity error. Such errors are
difficult to determine what really caused it. Hence this patch
classifies MCE errors into following four categorize:

  1. Hardware error:
  	UE and Link timeout failure errors.
  2. Probable hardware error (some chance of software cause)
  	SLB/ERAT/TLB Parity errors.
  3. Software error
  	Invalid tlbie form.
  4. Probable software error (some chance of hardware cause)
  	SLB/ERAT/TLB Multihit errors.

Sample output:

  MCE: CPU80: machine check (Warning) Guest SLB Multihit DAR: 000001001b6e0320 [Recovered]
  MCE: CPU80: PID: 24765 Comm: qemu-system-ppc Guest NIP: [00007fffa309dc60]
  MCE: CPU80: Probable Software error (some chance of hardware cause)
Signed-off-by: default avatarMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: default avatarMichael Ellerman <mpe@ellerman.id.au>
parent cda6618d
......@@ -56,6 +56,14 @@ enum MCE_ErrorType {
MCE_ERROR_TYPE_LINK = 7,
};
enum MCE_ErrorClass {
MCE_ECLASS_UNKNOWN = 0,
MCE_ECLASS_HARDWARE,
MCE_ECLASS_HARD_INDETERMINATE,
MCE_ECLASS_SOFTWARE,
MCE_ECLASS_SOFT_INDETERMINATE,
};
enum MCE_UeErrorType {
MCE_UE_ERROR_INDETERMINATE = 0,
MCE_UE_ERROR_IFETCH = 1,
......@@ -115,6 +123,7 @@ struct machine_check_event {
enum MCE_Severity severity:8;
enum MCE_Initiator initiator:8;
enum MCE_ErrorType error_type:8;
enum MCE_ErrorClass error_class:8;
enum MCE_Disposition disposition:8;
bool sync_error;
u16 cpu;
......@@ -195,6 +204,7 @@ struct mce_error_info {
} u;
enum MCE_Severity severity:8;
enum MCE_Initiator initiator:8;
enum MCE_ErrorClass error_class:8;
bool sync_error;
};
......
......@@ -123,6 +123,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->initiator = mce_err->initiator;
mce->severity = mce_err->severity;
mce->sync_error = mce_err->sync_error;
mce->error_class = mce_err->error_class;
/*
* Populate the mce error_type and type-specific error_type.
......@@ -363,6 +364,13 @@ void machine_check_print_event_info(struct machine_check_event *evt,
"Store (timeout)",
"Page table walk Load/Store (timeout)",
};
static const char *mc_error_class[] = {
"Unknown",
"Hardware error",
"Probable Hardware error (some chance of software cause)",
"Software error",
"Probable Software error (some chance of hardware cause)",
};
/* Print things out */
if (evt->version != MCE_V1) {
......@@ -487,6 +495,10 @@ void machine_check_print_event_info(struct machine_check_event *evt,
printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
}
subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
mc_error_class[evt->error_class] : "Unknown";
printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
}
EXPORT_SYMBOL_GPL(machine_check_print_event_info);
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment