powerpc/eeh: Dump PHB diag-data early
On the PowerNV platform, PHB diag-data is dumped after stopping device drivers. In case of recursive EEH errors, the kernel usually crashes before the PHB diag-data for the second EEH error is dumped. It's hard to locate the root cause of the second EEH error without PHB diag-data. This patch adds one more EEH option, "eeh=early_log", which dumps PHB diag-data immediately once a frozen PE is detected, in order to capture the PHB diag-data for the second EEH error. Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
This commit is contained in:
parent
b1d76a7d57
commit
a450e8f55a
arch/powerpc
|
@ -39,6 +39,7 @@ struct device_node;
|
||||||
#define EEH_PROBE_MODE_DEV 0x04 /* From PCI device */
|
#define EEH_PROBE_MODE_DEV 0x04 /* From PCI device */
|
||||||
#define EEH_PROBE_MODE_DEVTREE 0x08 /* From device tree */
|
#define EEH_PROBE_MODE_DEVTREE 0x08 /* From device tree */
|
||||||
#define EEH_ENABLE_IO_FOR_LOG 0x10 /* Enable IO for log */
|
#define EEH_ENABLE_IO_FOR_LOG 0x10 /* Enable IO for log */
|
||||||
|
#define EEH_EARLY_DUMP_LOG 0x20 /* Dump log immediately */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Delay for PE reset, all in ms
|
* Delay for PE reset, all in ms
|
||||||
|
|
|
@ -143,6 +143,8 @@ static int __init eeh_setup(char *str)
|
||||||
{
|
{
|
||||||
if (!strcmp(str, "off"))
|
if (!strcmp(str, "off"))
|
||||||
eeh_add_flag(EEH_FORCE_DISABLED);
|
eeh_add_flag(EEH_FORCE_DISABLED);
|
||||||
|
else if (!strcmp(str, "early_log"))
|
||||||
|
eeh_add_flag(EEH_EARLY_DUMP_LOG);
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -353,6 +353,9 @@ static int ioda_eeh_get_phb_state(struct eeh_pe *pe)
|
||||||
} else if (!(pe->state & EEH_PE_ISOLATED)) {
|
} else if (!(pe->state & EEH_PE_ISOLATED)) {
|
||||||
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
|
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
|
||||||
ioda_eeh_phb_diag(pe);
|
ioda_eeh_phb_diag(pe);
|
||||||
|
|
||||||
|
if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
|
||||||
|
pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
@ -451,6 +454,9 @@ static int ioda_eeh_get_pe_state(struct eeh_pe *pe)
|
||||||
|
|
||||||
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
|
eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
|
||||||
ioda_eeh_phb_diag(pe);
|
ioda_eeh_phb_diag(pe);
|
||||||
|
|
||||||
|
if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
|
||||||
|
pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
@ -730,7 +736,8 @@ static int ioda_eeh_reset(struct eeh_pe *pe, int option)
|
||||||
static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
|
static int ioda_eeh_get_log(struct eeh_pe *pe, int severity,
|
||||||
char *drv_log, unsigned long len)
|
char *drv_log, unsigned long len)
|
||||||
{
|
{
|
||||||
pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
|
if (!eeh_has_flag(EEH_EARLY_DUMP_LOG))
|
||||||
|
pnv_pci_dump_phb_diag_data(pe->phb, pe->data);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1086,6 +1093,10 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
|
||||||
!((*pe)->state & EEH_PE_ISOLATED)) {
|
!((*pe)->state & EEH_PE_ISOLATED)) {
|
||||||
eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
|
eeh_pe_state_mark(*pe, EEH_PE_ISOLATED);
|
||||||
ioda_eeh_phb_diag(*pe);
|
ioda_eeh_phb_diag(*pe);
|
||||||
|
|
||||||
|
if (eeh_has_flag(EEH_EARLY_DUMP_LOG))
|
||||||
|
pnv_pci_dump_phb_diag_data((*pe)->phb,
|
||||||
|
(*pe)->data);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue