x86/mce: Add quirk for instruction recovery on Sandy Bridge processors
Sandy Bridge processors follow the SDM (Vol 3B, Table 15-20) and set
both the RIPV and EIPV bits in the MCG_STATUS register to zero for
machine checks during instruction fetch. This is more than a little
counter-intuitive and means that Linux cannot recover from these
errors. Rather than insert special case code at several places in mce.c
and mce-severity.c, we pretend the EIPV bit was set for just this case
early in processing the machine check.

Acked-by: Borislav Petkov <bp@amd64.org>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Link: http://lkml.kernel.org/r/180a06f3f357cf9f78259ae443a082b14a29535b.1343078495.git.tony.luck@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
736edce5f3
commit
61b0fccd7f
|
@ -105,6 +105,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct work_struct, mce_work);
|
static DEFINE_PER_CPU(struct work_struct, mce_work);
|
||||||
|
|
||||||
|
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* CPU/chipset specific EDAC code can register a notifier call here to print
|
* CPU/chipset specific EDAC code can register a notifier call here to print
|
||||||
* MCE errors in a human-readable form.
|
* MCE errors in a human-readable form.
|
||||||
|
@ -652,14 +654,18 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
|
||||||
* Do a quick check if any of the events requires a panic.
|
* Do a quick check if any of the events requires a panic.
|
||||||
* This decides if we keep the events around or clear them.
|
* This decides if we keep the events around or clear them.
|
||||||
*/
|
*/
|
||||||
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp)
|
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
|
||||||
|
struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
int i, ret = 0;
|
int i, ret = 0;
|
||||||
|
|
||||||
for (i = 0; i < banks; i++) {
|
for (i = 0; i < banks; i++) {
|
||||||
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
|
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
|
||||||
if (m->status & MCI_STATUS_VAL)
|
if (m->status & MCI_STATUS_VAL) {
|
||||||
__set_bit(i, validp);
|
__set_bit(i, validp);
|
||||||
|
if (quirk_no_way_out)
|
||||||
|
quirk_no_way_out(i, m, regs);
|
||||||
|
}
|
||||||
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
|
if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
|
||||||
ret = 1;
|
ret = 1;
|
||||||
}
|
}
|
||||||
|
@ -1042,7 +1048,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
|
||||||
*final = m;
|
*final = m;
|
||||||
|
|
||||||
memset(valid_banks, 0, sizeof(valid_banks));
|
memset(valid_banks, 0, sizeof(valid_banks));
|
||||||
no_way_out = mce_no_way_out(&m, &msg, valid_banks);
|
no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
|
||||||
|
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
|
@ -1418,6 +1424,34 @@ static void __mcheck_cpu_init_generic(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
|
||||||
|
* EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
|
||||||
|
* Vol 3B Table 15-20). But this confuses both the code that determines
|
||||||
|
* whether the machine check occurred in kernel or user mode, and also
|
||||||
|
* the severity assessment code. Pretend that EIPV was set, and take the
|
||||||
|
* ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
|
||||||
|
*/
|
||||||
|
static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
|
||||||
|
{
|
||||||
|
if (bank != 0)
|
||||||
|
return;
|
||||||
|
if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
|
||||||
|
return;
|
||||||
|
if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
|
||||||
|
MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
|
||||||
|
MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
|
||||||
|
MCACOD)) !=
|
||||||
|
(MCI_STATUS_UC|MCI_STATUS_EN|
|
||||||
|
MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
|
||||||
|
MCI_STATUS_AR|MCACOD_INSTR))
|
||||||
|
return;
|
||||||
|
|
||||||
|
m->mcgstatus |= MCG_STATUS_EIPV;
|
||||||
|
m->ip = regs->ip;
|
||||||
|
m->cs = regs->cs;
|
||||||
|
}
|
||||||
|
|
||||||
/* Add per CPU specific workarounds here */
|
/* Add per CPU specific workarounds here */
|
||||||
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
|
static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
|
||||||
{
|
{
|
||||||
|
@ -1515,6 +1549,9 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
|
||||||
*/
|
*/
|
||||||
if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
|
if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
|
||||||
mce_bootlog = 0;
|
mce_bootlog = 0;
|
||||||
|
|
||||||
|
if (c->x86 == 6 && c->x86_model == 45)
|
||||||
|
quirk_no_way_out = quirk_sandybridge_ifu;
|
||||||
}
|
}
|
||||||
if (monarch_timeout < 0)
|
if (monarch_timeout < 0)
|
||||||
monarch_timeout = 0;
|
monarch_timeout = 0;
|
||||||
|
|
Loading…
Reference in New Issue