Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Ingo Molnar: - various AMD SMCA error parsing/reporting improvements (Yazen Ghannam) - extend Intel CMCI error reporting to more cases (Xie XiuQi) * 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/MCE: Make correctable error detection look at the Deferred bit x86/MCE: Report only DRAM ECC as memory errors on AMD systems x86/MCE/AMD: Define a function to get SMCA bank type x86/mce/AMD: Don't set DEF_INT_TYPE in MSR_CU_DEF_ERR on SMCA systems x86/MCE: Extend table to report action optional errors through CMCI too
This commit is contained in:
commit
a1c75e17e7
|
@ -376,6 +376,7 @@ struct smca_bank {
|
|||
extern struct smca_bank smca_banks[MAX_NR_BANKS];
|
||||
|
||||
extern const char *smca_get_long_name(enum smca_bank_types t);
|
||||
extern bool amd_mce_is_memory_error(struct mce *m);
|
||||
|
||||
extern int mce_threshold_create_device(unsigned int cpu);
|
||||
extern int mce_threshold_remove_device(unsigned int cpu);
|
||||
|
@ -384,6 +385,7 @@ extern int mce_threshold_remove_device(unsigned int cpu);
|
|||
|
||||
static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
|
||||
static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
|
||||
static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -59,6 +59,7 @@ static struct severity {
|
|||
#define MCGMASK(x, y) .mcgmask = x, .mcgres = y
|
||||
#define MASK(x, y) .mask = x, .result = y
|
||||
#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
|
||||
#define MCI_UC_AR (MCI_STATUS_UC|MCI_STATUS_AR)
|
||||
#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
|
||||
#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV)
|
||||
|
||||
|
@ -101,6 +102,22 @@ static struct severity {
|
|||
NOSER, BITCLR(MCI_STATUS_UC)
|
||||
),
|
||||
|
||||
/*
|
||||
* known AO MCACODs reported via MCE or CMC:
|
||||
*
|
||||
* SRAO could be signaled either via a machine check exception or
|
||||
* CMCI with the corresponding bit S 1 or 0. So we don't need to
|
||||
* check bit S for SRAO.
|
||||
*/
|
||||
MCESEV(
|
||||
AO, "Action optional: memory scrubbing error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD_SCRUBMSK, MCI_STATUS_UC|MCACOD_SCRUB)
|
||||
),
|
||||
MCESEV(
|
||||
AO, "Action optional: last level cache writeback error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
|
||||
),
|
||||
|
||||
/* ignore OVER for UCNA */
|
||||
MCESEV(
|
||||
UCNA, "Uncorrected no action required",
|
||||
|
@ -149,15 +166,6 @@ static struct severity {
|
|||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
|
||||
),
|
||||
|
||||
/* known AO MCACODs: */
|
||||
MCESEV(
|
||||
AO, "Action optional: memory scrubbing error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB)
|
||||
),
|
||||
MCESEV(
|
||||
AO, "Action optional: last level cache writeback error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB)
|
||||
),
|
||||
MCESEV(
|
||||
SOME, "Action optional: unknown MCACOD",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
|
||||
|
|
|
@ -503,10 +503,8 @@ static int mce_usable_address(struct mce *m)
|
|||
bool mce_is_memory_error(struct mce *m)
|
||||
{
|
||||
if (m->cpuvendor == X86_VENDOR_AMD) {
|
||||
/* ErrCodeExt[20:16] */
|
||||
u8 xec = (m->status >> 16) & 0x1f;
|
||||
return amd_mce_is_memory_error(m);
|
||||
|
||||
return (xec == 0x0 || xec == 0x8);
|
||||
} else if (m->cpuvendor == X86_VENDOR_INTEL) {
|
||||
/*
|
||||
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
|
||||
|
@ -530,6 +528,17 @@ bool mce_is_memory_error(struct mce *m)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(mce_is_memory_error);
|
||||
|
||||
static bool mce_is_correctable(struct mce *m)
|
||||
{
|
||||
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
|
||||
return false;
|
||||
|
||||
if (m->status & MCI_STATUS_UC)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool cec_add_mce(struct mce *m)
|
||||
{
|
||||
if (!m)
|
||||
|
@ -537,7 +546,7 @@ static bool cec_add_mce(struct mce *m)
|
|||
|
||||
/* We eat only correctable DRAM errors with usable addresses. */
|
||||
if (mce_is_memory_error(m) &&
|
||||
!(m->status & MCI_STATUS_UC) &&
|
||||
mce_is_correctable(m) &&
|
||||
mce_usable_address(m))
|
||||
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
|
||||
return true;
|
||||
|
|
|
@ -110,6 +110,20 @@ const char *smca_get_long_name(enum smca_bank_types t)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(smca_get_long_name);
|
||||
|
||||
static enum smca_bank_types smca_get_bank_type(struct mce *m)
|
||||
{
|
||||
struct smca_bank *b;
|
||||
|
||||
if (m->bank >= N_SMCA_BANK_TYPES)
|
||||
return N_SMCA_BANK_TYPES;
|
||||
|
||||
b = &smca_banks[m->bank];
|
||||
if (!b->hwid)
|
||||
return N_SMCA_BANK_TYPES;
|
||||
|
||||
return b->hwid->bank_type;
|
||||
}
|
||||
|
||||
static struct smca_hwid smca_hwid_mcatypes[] = {
|
||||
/* { bank_type, hwid_mcatype, xec_bitmap } */
|
||||
|
||||
|
@ -407,7 +421,9 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
|
|||
(deferred_error_int_vector != amd_deferred_error_interrupt))
|
||||
deferred_error_int_vector = amd_deferred_error_interrupt;
|
||||
|
||||
if (!mce_flags.smca)
|
||||
low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
|
||||
|
||||
wrmsr(MSR_CU_DEF_ERR, low, high);
|
||||
}
|
||||
|
||||
|
@ -738,6 +754,17 @@ out_err:
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
|
||||
|
||||
bool amd_mce_is_memory_error(struct mce *m)
|
||||
{
|
||||
/* ErrCodeExt[20:16] */
|
||||
u8 xec = (m->status >> 16) & 0x1f;
|
||||
|
||||
if (mce_flags.smca)
|
||||
return smca_get_bank_type(m) == SMCA_UMC && xec == 0x0;
|
||||
|
||||
return m->bank == 4 && xec == 0x8;
|
||||
}
|
||||
|
||||
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
|
||||
{
|
||||
struct mce m;
|
||||
|
|
Loading…
Reference in New Issue