Merge branch 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull ras fixes from Thomas Gleixner: "A set of fixes for RAS/MCE: - Improve the error message when the kernel cannot recover from a MCE so the maximum amount of information gets provided. - Individually check MCE recovery features on SkyLake CPUs instead of assuming none when the CAPID0 register does not advertise the general ability for recovery. - Prevent MCE to output inconsistent messages which first show an error location and then claim that the source is unknown. - Prevent overwriting MCi_STATUS in the attempt to gather more information when a fatal MCE has alreay been detected. This leads to empty status values in the printout and failing to react promptly on the fatal event" * 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Fix incorrect "Machine check from unknown source" message x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out() x86/mce: Check for alternate indication of machine check recovery on Skylake x86/mce: Improve error message when kernel cannot recover
This commit is contained in:
commit
a43de48993
|
@ -160,6 +160,11 @@ static struct severity {
|
|||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
|
||||
USER
|
||||
),
|
||||
MCESEV(
|
||||
PANIC, "Data load in unrecoverable area of kernel",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
||||
KERNEL
|
||||
),
|
||||
#endif
|
||||
MCESEV(
|
||||
PANIC, "Action required: unknown MCACOD",
|
||||
|
|
|
@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
|
|||
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
|
||||
struct pt_regs *regs)
|
||||
{
|
||||
int i, ret = 0;
|
||||
char *tmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < mca_cfg.banks; i++) {
|
||||
m->status = mce_rdmsrl(msr_ops.status(i));
|
||||
if (m->status & MCI_STATUS_VAL) {
|
||||
__set_bit(i, validp);
|
||||
if (quirk_no_way_out)
|
||||
quirk_no_way_out(i, m, regs);
|
||||
}
|
||||
if (!(m->status & MCI_STATUS_VAL))
|
||||
continue;
|
||||
|
||||
__set_bit(i, validp);
|
||||
if (quirk_no_way_out)
|
||||
quirk_no_way_out(i, m, regs);
|
||||
|
||||
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
|
||||
mce_read_aux(m, i);
|
||||
*msg = tmp;
|
||||
ret = 1;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
lmce = m.mcgstatus & MCG_STATUS_LMCES;
|
||||
|
||||
/*
|
||||
* Local machine check may already know that we have to panic.
|
||||
* Broadcast machine check begins rendezvous in mce_start()
|
||||
* Go through all banks in exclusion of the other CPUs. This way we
|
||||
* don't report duplicated events on shared banks because the first one
|
||||
* to see it will clear it. If this is a Local MCE, then no need to
|
||||
* perform rendezvous.
|
||||
* to see it will clear it.
|
||||
*/
|
||||
if (!lmce)
|
||||
if (lmce) {
|
||||
if (no_way_out)
|
||||
mce_panic("Fatal local machine check", &m, msg);
|
||||
} else {
|
||||
order = mce_start(&no_way_out);
|
||||
}
|
||||
|
||||
for (i = 0; i < cfg->banks; i++) {
|
||||
__clear_bit(i, toclear);
|
||||
|
@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
|||
no_way_out = worst >= MCE_PANIC_SEVERITY;
|
||||
} else {
|
||||
/*
|
||||
* Local MCE skipped calling mce_reign()
|
||||
* If we found a fatal error, we need to panic here.
|
||||
* If there was a fatal machine check we should have
|
||||
* already called mce_panic earlier in this function.
|
||||
* Since we re-read the banks, we might have found
|
||||
* something new. Check again to see if we found a
|
||||
* fatal error. We call "mce_severity()" again to
|
||||
* make sure we have the right "msg".
|
||||
*/
|
||||
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
|
||||
mce_panic("Machine check from unknown source",
|
||||
NULL, NULL);
|
||||
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
|
||||
mce_severity(&m, cfg->tolerant, &msg, true);
|
||||
mce_panic("Local fatal machine check!", &m, msg);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
|
|||
/* Skylake */
|
||||
static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
|
||||
{
|
||||
u32 capid0;
|
||||
u32 capid0, capid5;
|
||||
|
||||
pci_read_config_dword(pdev, 0x84, &capid0);
|
||||
pci_read_config_dword(pdev, 0x98, &capid5);
|
||||
|
||||
if ((capid0 & 0xc0) == 0xc0)
|
||||
/*
|
||||
* CAPID0{7:6} indicate whether this is an advanced RAS SKU
|
||||
* CAPID5{8:5} indicate that various NVDIMM usage modes are
|
||||
* enabled, so memory machine check recovery is also enabled.
|
||||
*/
|
||||
if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
|
||||
static_branch_inc(&mcsafe_key);
|
||||
|
||||
}
|
||||
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
|
||||
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
|
||||
|
|
Loading…
Reference in New Issue