x86/mce: Simplify AMD severity grading logic
The MCE handler needs to understand the severity of machine errors in order to act accordingly. Simplify the AMD grading logic so that it closely follows the descriptions in the public PPR documents. This will make it easier to add more fine-grained grading of errors in the future. [ bp: Touchups. ] Signed-off-by: Carlos Bilbao <carlos.bilbao@amd.com> Signed-off-by: Borislav Petkov <bp@suse.de> Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com> Link: https://lore.kernel.org/r/20220405183212.354606-2-carlos.bilbao@amd.com
This commit is contained in:
parent
e5f28623ce
commit
70c459d915
|
@ -301,85 +301,56 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
|
|||
}
|
||||
}
|
||||
|
||||
static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
|
||||
{
|
||||
u64 mcx_cfg;
|
||||
|
||||
/*
|
||||
* We need to look at the following bits:
|
||||
* - "succor" bit (data poisoning support), and
|
||||
* - TCC bit (Task Context Corrupt)
|
||||
* in MCi_STATUS to determine error severity.
|
||||
*/
|
||||
if (!mce_flags.succor)
|
||||
return MCE_PANIC_SEVERITY;
|
||||
|
||||
mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
|
||||
|
||||
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
|
||||
if ((mcx_cfg & MCI_CONFIG_MCAX) &&
|
||||
(m->status & MCI_STATUS_TCC) &&
|
||||
(err_ctx == IN_KERNEL))
|
||||
return MCE_PANIC_SEVERITY;
|
||||
|
||||
/* ...otherwise invoke hwpoison handler. */
|
||||
return MCE_AR_SEVERITY;
|
||||
}
|
||||
|
||||
/*
|
||||
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
|
||||
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
|
||||
*/
|
||||
/* See AMD PPR(s) section Machine Check Error Handling. */
|
||||
static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
|
||||
{
|
||||
enum context ctx = error_context(m, regs);
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Default return value: Action required, the error must be handled
|
||||
* immediately.
|
||||
*/
|
||||
ret = MCE_AR_SEVERITY;
|
||||
|
||||
/* Processor Context Corrupt, no need to fumble too much, die! */
|
||||
if (m->status & MCI_STATUS_PCC)
|
||||
return MCE_PANIC_SEVERITY;
|
||||
if (m->status & MCI_STATUS_PCC) {
|
||||
ret = MCE_PANIC_SEVERITY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (m->status & MCI_STATUS_UC) {
|
||||
|
||||
if (ctx == IN_KERNEL)
|
||||
return MCE_PANIC_SEVERITY;
|
||||
|
||||
/*
|
||||
* On older systems where overflow_recov flag is not present, we
|
||||
* should simply panic if an error overflow occurs. If
|
||||
* overflow_recov flag is present and set, then software can try
|
||||
* to at least kill process to prolong system operation.
|
||||
*/
|
||||
if (mce_flags.overflow_recov) {
|
||||
if (mce_flags.smca)
|
||||
return mce_severity_amd_smca(m, ctx);
|
||||
|
||||
/* kill current process */
|
||||
return MCE_AR_SEVERITY;
|
||||
} else {
|
||||
/* at least one error was not logged */
|
||||
if (m->status & MCI_STATUS_OVER)
|
||||
return MCE_PANIC_SEVERITY;
|
||||
}
|
||||
|
||||
/*
|
||||
* For any other case, return MCE_UC_SEVERITY so that we log the
|
||||
* error and exit #MC handler.
|
||||
*/
|
||||
return MCE_UC_SEVERITY;
|
||||
if (m->status & MCI_STATUS_DEFERRED) {
|
||||
ret = MCE_DEFERRED_SEVERITY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* deferred error: poll handler catches these and adds to mce_ring so
|
||||
* memory-failure can take recovery actions.
|
||||
* If the UC bit is not set, the system either corrected or deferred
|
||||
* the error. No action will be required after logging the error.
|
||||
*/
|
||||
if (m->status & MCI_STATUS_DEFERRED)
|
||||
return MCE_DEFERRED_SEVERITY;
|
||||
if (!(m->status & MCI_STATUS_UC)) {
|
||||
ret = MCE_KEEP_SEVERITY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* corrected error: poll handler catches these and passes responsibility
|
||||
* of decoding the error to EDAC
|
||||
* On MCA overflow, without the MCA overflow recovery feature the
|
||||
* system will not be able to recover, panic.
|
||||
*/
|
||||
return MCE_KEEP_SEVERITY;
|
||||
if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
|
||||
ret = MCE_PANIC_SEVERITY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!mce_flags.succor) {
|
||||
ret = MCE_PANIC_SEVERITY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (error_context(m, regs) == IN_KERNEL)
|
||||
ret = MCE_PANIC_SEVERITY;
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
|
||||
|
|
Loading…
Reference in New Issue