x86/mce: Simplify AMD severity grading logic

The MCE handler needs to understand the severity of the machine errors to
act accordingly. Simplify the AMD grading logic following a logic that
closely resembles the descriptions of the public PPR documents. This will
help include more fine-grained grading of errors in the future.

  [ bp: Touchups. ]

Signed-off-by: Carlos Bilbao <carlos.bilbao@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Link: https://lore.kernel.org/r/20220405183212.354606-2-carlos.bilbao@amd.com
This commit is contained in:
Carlos Bilbao 2022-04-05 13:32:13 -05:00 committed by Borislav Petkov
parent e5f28623ce
commit 70c459d915
1 changed files with 38 additions and 67 deletions

View File

@ -301,85 +301,56 @@ static noinstr int error_context(struct mce *m, struct pt_regs *regs)
}
}
static __always_inline int mce_severity_amd_smca(struct mce *m, enum context err_ctx)
{
u64 mcx_cfg;
/*
* We need to look at the following bits:
* - "succor" bit (data poisoning support), and
* - TCC bit (Task Context Corrupt)
* in MCi_STATUS to determine error severity.
*/
if (!mce_flags.succor)
return MCE_PANIC_SEVERITY;
mcx_cfg = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(m->bank));
/* TCC (Task context corrupt). If set and if IN_KERNEL, panic. */
if ((mcx_cfg & MCI_CONFIG_MCAX) &&
(m->status & MCI_STATUS_TCC) &&
(err_ctx == IN_KERNEL))
return MCE_PANIC_SEVERITY;
/* ...otherwise invoke hwpoison handler. */
return MCE_AR_SEVERITY;
}
/*
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*/
/* See AMD PPR(s) section Machine Check Error Handling. */
static noinstr int mce_severity_amd(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)
{
enum context ctx = error_context(m, regs);
int ret;
/*
* Default return value: Action required, the error must be handled
* immediately.
*/
ret = MCE_AR_SEVERITY;
/* Processor Context Corrupt, no need to fumble too much, die! */
if (m->status & MCI_STATUS_PCC)
return MCE_PANIC_SEVERITY;
if (m->status & MCI_STATUS_PCC) {
ret = MCE_PANIC_SEVERITY;
goto out;
}
if (m->status & MCI_STATUS_UC) {
if (ctx == IN_KERNEL)
return MCE_PANIC_SEVERITY;
/*
* On older systems where overflow_recov flag is not present, we
* should simply panic if an error overflow occurs. If
* overflow_recov flag is present and set, then software can try
* to at least kill process to prolong system operation.
*/
if (mce_flags.overflow_recov) {
if (mce_flags.smca)
return mce_severity_amd_smca(m, ctx);
/* kill current process */
return MCE_AR_SEVERITY;
} else {
/* at least one error was not logged */
if (m->status & MCI_STATUS_OVER)
return MCE_PANIC_SEVERITY;
}
/*
* For any other case, return MCE_UC_SEVERITY so that we log the
* error and exit #MC handler.
*/
return MCE_UC_SEVERITY;
if (m->status & MCI_STATUS_DEFERRED) {
ret = MCE_DEFERRED_SEVERITY;
goto out;
}
/*
* deferred error: poll handler catches these and adds to mce_ring so
* memory-failure can take recovery actions.
* If the UC bit is not set, the system either corrected or deferred
* the error. No action will be required after logging the error.
*/
if (m->status & MCI_STATUS_DEFERRED)
return MCE_DEFERRED_SEVERITY;
if (!(m->status & MCI_STATUS_UC)) {
ret = MCE_KEEP_SEVERITY;
goto out;
}
/*
* corrected error: poll handler catches these and passes responsibility
* of decoding the error to EDAC
* On MCA overflow, without the MCA overflow recovery feature the
* system will not be able to recover, panic.
*/
return MCE_KEEP_SEVERITY;
if ((m->status & MCI_STATUS_OVER) && !mce_flags.overflow_recov) {
ret = MCE_PANIC_SEVERITY;
goto out;
}
if (!mce_flags.succor) {
ret = MCE_PANIC_SEVERITY;
goto out;
}
if (error_context(m, regs) == IN_KERNEL)
ret = MCE_PANIC_SEVERITY;
out:
return ret;
}
static noinstr int mce_severity_intel(struct mce *m, struct pt_regs *regs, char **msg, bool is_excp)