Handle all uncorrected error reports in the same way (soft offline
 the page). We used to only do that for SRAO (software recoverable
 action optional) machine checks, but it makes sense to also do it
 for UCNA (UnCorrected No Action) logs found by CMCI or polling.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABAgAGBQJUbnMdAAoJEKurIx+X31iBQZoP/iy1bmIz8SzY3TjtKuTw+s7w
 j8j00QNYiYYz3m6LS+UG/CYA8G2GEm8GUYRGcZcnp8IWV1G5YxoK97QMHHvfvkbS
 apbVpSUtosaJd9eeTc5wbpqvNdPhay+L6kTn/thRNXEC3/82mmzb8kfjp5QP5uV0
 hBTcbHfA9LiGkAHQ8N7xPV/GJjMYnEuWU5U+Ny5IJTUfvLW4461ZgtXVFCJXCgvC
 RBIZAf0OuvOOz4vouiiLbzNdDmpjHk0cVxlnsb0oGfXK2uYsB50LtzmWmfvEbNrI
 xoWcpTi+W9odFNaoCk36T2RD/tQmX3VK9j51pcGmASFD4GJgkoCbB2v8oJmYhZ8t
 TOy2hYHis7jRIaz+q3nG3GXKNbnDM/XfxMbndo9M97MIfm4MpAZj2s9enZ2+ciHa
 HXjAJI+eQb/OarnuFiE8hZt4Z2nbTM+tNR7DDR5swJ1DMdhC1LoKUuXeWf4esfzi
 +4N7HWat8mPL1VbYLleveNVLoS4e56F/uwJQ331MZoulk+co051xjPX8qO7VGj0G
 9L32JNSMjWr7q99PSOe7cbPwmAAg7s5fyEOQRO+D4jTRcHzB0gYSDxewOVLA5UxO
 +rX1YNpdrGF1YWp8bxzCg3gi44YF/0mpph8TDb+Fcq8T3s2C8C+YXsIZPemxZGsY
 aPbK1jJ6oBCr43G02kXS
 =1uYH
 -----END PGP SIGNATURE-----

Merge tag 'please-pull-ucna' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras

Merge RAS updates from Tony Luck:

"Handle all uncorrected error reports in the same way (soft offline
 the page). We used to only do that for SRAO (software recoverable
 action optional) machine checks, but it makes sense to also do it
 for UCNA (UnCorrected No Action) logs found by CMCI or polling."
This commit is contained in:
Thomas Gleixner 2014-11-21 15:30:30 +01:00
commit b9e6df0a2d
5 changed files with 78 additions and 16 deletions

View File

@ -34,6 +34,10 @@
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */
/* AMD-specific bits */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
/* /*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
* bits 15:0. But bit 12 is the 'F' bit, defined for corrected * bits 15:0. But bit 12 is the 'F' bit, defined for corrected

View File

@ -3,6 +3,8 @@
enum severity_level { enum severity_level {
MCE_NO_SEVERITY, MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY,
MCE_KEEP_SEVERITY, MCE_KEEP_SEVERITY,
MCE_SOME_SEVERITY, MCE_SOME_SEVERITY,
MCE_AO_SEVERITY, MCE_AO_SEVERITY,
@ -21,7 +23,7 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */ char attrname[ATTR_LEN]; /* attribute name */
}; };
int mce_severity(struct mce *a, int tolerant, char **msg); int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void); struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks; extern struct mce_bank *mce_banks;

View File

@ -31,6 +31,7 @@
enum context { IN_KERNEL = 1, IN_USER = 2 }; enum context { IN_KERNEL = 1, IN_USER = 2 };
enum ser { SER_REQUIRED = 1, NO_SER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 };
enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 };
static struct severity { static struct severity {
u64 mask; u64 mask;
@ -40,6 +41,7 @@ static struct severity {
unsigned char mcgres; unsigned char mcgres;
unsigned char ser; unsigned char ser;
unsigned char context; unsigned char context;
unsigned char excp;
unsigned char covered; unsigned char covered;
char *msg; char *msg;
} severities[] = { } severities[] = {
@ -48,6 +50,8 @@ static struct severity {
#define USER .context = IN_USER #define USER .context = IN_USER
#define SER .ser = SER_REQUIRED #define SER .ser = SER_REQUIRED
#define NOSER .ser = NO_SER #define NOSER .ser = NO_SER
#define EXCP .excp = EXCP_CONTEXT
#define NOEXCP .excp = NO_EXCP
#define BITCLR(x) .mask = x, .result = 0 #define BITCLR(x) .mask = x, .result = 0
#define BITSET(x) .mask = x, .result = x #define BITSET(x) .mask = x, .result = x
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y #define MCGMASK(x, y) .mcgmask = x, .mcgres = y
@ -62,7 +66,7 @@ static struct severity {
), ),
MCESEV( MCESEV(
NO, "Not enabled", NO, "Not enabled",
BITCLR(MCI_STATUS_EN) EXCP, BITCLR(MCI_STATUS_EN)
), ),
MCESEV( MCESEV(
PANIC, "Processor context corrupt", PANIC, "Processor context corrupt",
@ -71,16 +75,20 @@ static struct severity {
/* When MCIP is not set something is very confused */ /* When MCIP is not set something is very confused */
MCESEV( MCESEV(
PANIC, "MCIP not set in MCA handler", PANIC, "MCIP not set in MCA handler",
MCGMASK(MCG_STATUS_MCIP, 0) EXCP, MCGMASK(MCG_STATUS_MCIP, 0)
), ),
/* Neither return nor error IP -- no chance to recover -> PANIC */ /* Neither return nor error IP -- no chance to recover -> PANIC */
MCESEV( MCESEV(
PANIC, "Neither restart nor error IP", PANIC, "Neither restart nor error IP",
MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
), ),
MCESEV( MCESEV(
PANIC, "In kernel and no restart IP", PANIC, "In kernel and no restart IP",
KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
),
MCESEV(
DEFERRED, "Deferred error",
NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED)
), ),
MCESEV( MCESEV(
KEEP, "Corrected error", KEEP, "Corrected error",
@ -89,7 +97,7 @@ static struct severity {
/* ignore OVER for UCNA */ /* ignore OVER for UCNA */
MCESEV( MCESEV(
KEEP, "Uncorrected no action required", UCNA, "Uncorrected no action required",
SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
), ),
MCESEV( MCESEV(
@ -178,8 +186,9 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
} }
int mce_severity(struct mce *m, int tolerant, char **msg) int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
{ {
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m); enum context ctx = error_context(m);
struct severity *s; struct severity *s;
@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
continue; continue;
if (s->context && ctx != s->context) if (s->context && ctx != s->context)
continue; continue;
if (s->excp && excp != s->excp)
continue;
if (msg) if (msg)
*msg = s->msg; *msg = s->msg;
s->covered = 1; s->covered = 1;

View File

@ -575,6 +575,37 @@ static void mce_read_aux(struct mce *m, int i)
} }
} }
/*
 * Report whether the machine check record @m describes a memory error.
 *
 * Only Intel status words are decoded.  AMD support is still a
 * placeholder ("coming soon"), so AMD and every other vendor yield false.
 */
static bool memory_error(struct mce *m)
{
	struct cpuinfo_x86 *cpu = &boot_cpu_data;

	/* AMD decoding is not implemented yet; unknown vendors are not decoded. */
	if (cpu->x86_vendor != X86_VENDOR_INTEL)
		return false;

	/*
	 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
	 *
	 * Within the MCACOD field of IA32_MCi_STATUS:
	 *   - bit 7 indicates a memory error,
	 *   - bit 8 indicates a cache hierarchy error,
	 *   - bits 2 and 3 together indicate a `generic' cache hierarchy
	 *     error.
	 *
	 * These bits cannot be tested blindly: when bit 11 is set the record
	 * is a bus/interconnect error and the bits above merely give more
	 * detail about it.  Each mask below therefore also covers bit 11 and
	 * the higher MCACOD bits, which must all be clear.  Bit 12 is left
	 * out of every mask because it is the "filter" bit and can be
	 * ignored.
	 */

	/* Memory error: bit 7 set, bits 8-11 and 13-15 clear. */
	if ((m->status & 0xef80) == BIT(7))
		return true;

	/* Cache hierarchy error: bit 8 set, bits 9-11 and 13-15 clear. */
	if ((m->status & 0xef00) == BIT(8))
		return true;

	/* Generic cache hierarchy error: only bits 2 and 3 set. */
	return (m->status & 0xeffc) == 0xc;
}
DEFINE_PER_CPU(unsigned, mce_poll_count); DEFINE_PER_CPU(unsigned, mce_poll_count);
/* /*
@ -595,6 +626,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{ {
struct mce m; struct mce m;
int severity;
int i; int i;
this_cpu_inc(mce_poll_count); this_cpu_inc(mce_poll_count);
@ -630,6 +662,20 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(flags & MCP_TIMESTAMP)) if (!(flags & MCP_TIMESTAMP))
m.tsc = 0; m.tsc = 0;
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
/*
* In the cases where we don't have a valid address after all,
* do not add it into the ring buffer.
*/
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
if (m.status & MCI_STATUS_ADDRV) {
mce_ring_add(m.addr >> PAGE_SHIFT);
mce_schedule_work();
}
}
/* /*
* Don't get the IP here because it's unlikely to * Don't get the IP here because it's unlikely to
* have anything to do with the actual error location. * have anything to do with the actual error location.
@ -668,7 +714,8 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out) if (quirk_no_way_out)
quirk_no_way_out(i, m, regs); quirk_no_way_out(i, m, regs);
} }
if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
MCE_PANIC_SEVERITY)
ret = 1; ret = 1;
} }
return ret; return ret;
@ -754,7 +801,7 @@ static void mce_reign(void)
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
int severity = mce_severity(&per_cpu(mces_seen, cpu), int severity = mce_severity(&per_cpu(mces_seen, cpu),
mca_cfg.tolerant, mca_cfg.tolerant,
&nmsg); &nmsg, true);
if (severity > global_worst) { if (severity > global_worst) {
msg = nmsg; msg = nmsg;
global_worst = severity; global_worst = severity;
@ -1095,13 +1142,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
*/ */
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
severity = mce_severity(&m, cfg->tolerant, NULL); severity = mce_severity(&m, cfg->tolerant, NULL, true);
/* /*
* When machine check was for corrected handler don't touch, * When machine check was for corrected/deferred handler don't
* unless we're panicking. * touch, unless we're panicking.
*/ */
if (severity == MCE_KEEP_SEVERITY && !no_way_out) if ((severity == MCE_KEEP_SEVERITY ||
severity == MCE_UCNA_SEVERITY) && !no_way_out)
continue; continue;
__set_bit(i, toclear); __set_bit(i, toclear);
if (severity == MCE_NO_SEVERITY) { if (severity == MCE_NO_SEVERITY) {

View File

@ -32,9 +32,6 @@
#define R4(x) (((x) >> 4) & 0xf) #define R4(x) (((x) >> 4) & 0xf)
#define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!") #define R4_MSG(x) ((R4(x) < 9) ? rrrr_msgs[R4(x)] : "Wrong R4!")
#define MCI_STATUS_DEFERRED BIT_64(44)
#define MCI_STATUS_POISON BIT_64(43)
extern const char * const pp_msgs[]; extern const char * const pp_msgs[];
enum tt_ids { enum tt_ids {