Use perf/event tracing to report PCI Express advanced errors.
-----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) iQIcBAABAgAGBQJQ5zH/AAoJEKurIx+X31iByOsP/iNc+1ltI6fKemug/z8m9QQs y1oNjtI9boCGglpqRpQ0+v3D9Y9w259T8hO/2wcbQ6te+KvhCzOcXaHtcA9DTN3L FIWkK0l5nPmEPKC0tM3EWzdWb1vUXiNCD+ouwZoXwUFRuy9Gq/Kzl00/xrYDinuK Ogb59DmqBdtv34cUgJozqg2vdKuBI88+Usg9ervSXwVdg4NBaEvJjnesFrmSz6dN U8gGW+YAY2gEpZ32KGbofMLUJUtwru0BtR1vkIgY23magvkqNkNExcODAPpRbOIf +Vfv2hWP/i9dRFh4cnxwVb6fSjQ9JFr7iXwK6SLQ9XvK35tN2QLTiU7z/9IfXPTd dZrW6HviugvKOi8vS/e7S6nb/7KfQtj1Ix5K0u9rFMWUdDKvhC1EfVMU0maBQ6ZP OhRmJqPMxG+1zHJ0zIhhiut/pmMrDRLK9MFs71Ger4/V78LmIwVQlZ+8fwslkA6L ceKzECWhBi2JRZKJKo379oJwROxYoCkx39uxcBrCEZ6RogcFQ2kyYSSBKbt7RJ24 5f2M8H98b+i9LQSoRYrDPZN4n5TQoG6LP+GmkdwZTNKDUZ4CUjBhnLEylm1ups+s q8Ji1dHzJcmBnRsRhwnreef1Q7Bty3ZwALn0kqvUyeoY9y5NGuYA8tb28P8l7tVa WYSo98WUb7+czTgpzOIh =nIcy -----END PGP SIGNATURE----- Merge tag 'please-pull-aer-trace' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into perf/core Use perf/event tracing to report PCI Express advanced errors, by Tony Luck. Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
commit
7c3c867f8d
|
@ -29,6 +29,7 @@
|
|||
#include <linux/time.h>
|
||||
#include <linux/cper.h>
|
||||
#include <linux/acpi.h>
|
||||
#include <linux/pci.h>
|
||||
#include <linux/aer.h>
|
||||
|
||||
/*
|
||||
|
@ -249,6 +250,10 @@ static const char *cper_pcie_port_type_strs[] = {
|
|||
static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
|
||||
const struct acpi_hest_generic_data *gdata)
|
||||
{
|
||||
#ifdef CONFIG_ACPI_APEI_PCIEAER
|
||||
struct pci_dev *dev;
|
||||
#endif
|
||||
|
||||
if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE)
|
||||
printk("%s""port_type: %d, %s\n", pfx, pcie->port_type,
|
||||
pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ?
|
||||
|
@ -281,10 +286,18 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie,
|
|||
"%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n",
|
||||
pfx, pcie->bridge.secondary_status, pcie->bridge.control);
|
||||
#ifdef CONFIG_ACPI_APEI_PCIEAER
|
||||
if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) {
|
||||
struct aer_capability_regs *aer_regs = (void *)pcie->aer_info;
|
||||
cper_print_aer(pfx, gdata->error_severity, aer_regs);
|
||||
dev = pci_get_domain_bus_and_slot(pcie->device_id.segment,
|
||||
pcie->device_id.bus, pcie->device_id.function);
|
||||
if (!dev) {
|
||||
pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n",
|
||||
pcie->device_id.segment, pcie->device_id.bus,
|
||||
pcie->device_id.slot, pcie->device_id.function);
|
||||
return;
|
||||
}
|
||||
if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO)
|
||||
cper_print_aer(pfx, dev, gdata->error_severity,
|
||||
(struct aer_capability_regs *) pcie->aer_info);
|
||||
pci_dev_put(dev);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -23,6 +23,9 @@
|
|||
|
||||
#include "aerdrv.h"
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
#include <trace/events/ras.h>
|
||||
|
||||
#define AER_AGENT_RECEIVER 0
|
||||
#define AER_AGENT_REQUESTER 1
|
||||
#define AER_AGENT_COMPLETER 2
|
||||
|
@ -121,12 +124,11 @@ static const char *aer_agent_string[] = {
|
|||
"Transmitter ID"
|
||||
};
|
||||
|
||||
static void __aer_print_error(const char *prefix,
|
||||
static void __aer_print_error(struct pci_dev *dev,
|
||||
struct aer_err_info *info)
|
||||
{
|
||||
int i, status;
|
||||
const char *errmsg = NULL;
|
||||
|
||||
status = (info->status & ~info->mask);
|
||||
|
||||
for (i = 0; i < 32; i++) {
|
||||
|
@ -141,26 +143,22 @@ static void __aer_print_error(const char *prefix,
|
|||
aer_uncorrectable_error_string[i] : NULL;
|
||||
|
||||
if (errmsg)
|
||||
printk("%s"" [%2d] %-22s%s\n", prefix, i, errmsg,
|
||||
dev_err(&dev->dev, " [%2d] %-22s%s\n", i, errmsg,
|
||||
info->first_error == i ? " (First)" : "");
|
||||
else
|
||||
printk("%s"" [%2d] Unknown Error Bit%s\n", prefix, i,
|
||||
info->first_error == i ? " (First)" : "");
|
||||
dev_err(&dev->dev, " [%2d] Unknown Error Bit%s\n",
|
||||
i, info->first_error == i ? " (First)" : "");
|
||||
}
|
||||
}
|
||||
|
||||
void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
||||
{
|
||||
int id = ((dev->bus->number << 8) | dev->devfn);
|
||||
char prefix[44];
|
||||
|
||||
snprintf(prefix, sizeof(prefix), "%s%s %s: ",
|
||||
(info->severity == AER_CORRECTABLE) ? KERN_WARNING : KERN_ERR,
|
||||
dev_driver_string(&dev->dev), dev_name(&dev->dev));
|
||||
|
||||
if (info->status == 0) {
|
||||
printk("%s""PCIe Bus Error: severity=%s, type=Unaccessible, "
|
||||
"id=%04x(Unregistered Agent ID)\n", prefix,
|
||||
dev_err(&dev->dev,
|
||||
"PCIe Bus Error: severity=%s, type=Unaccessible, "
|
||||
"id=%04x(Unregistered Agent ID)\n",
|
||||
aer_error_severity_string[info->severity], id);
|
||||
} else {
|
||||
int layer, agent;
|
||||
|
@ -168,22 +166,24 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
|||
layer = AER_GET_LAYER_ERROR(info->severity, info->status);
|
||||
agent = AER_GET_AGENT(info->severity, info->status);
|
||||
|
||||
printk("%s""PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
|
||||
prefix, aer_error_severity_string[info->severity],
|
||||
dev_err(&dev->dev,
|
||||
"PCIe Bus Error: severity=%s, type=%s, id=%04x(%s)\n",
|
||||
aer_error_severity_string[info->severity],
|
||||
aer_error_layer[layer], id, aer_agent_string[agent]);
|
||||
|
||||
printk("%s"" device [%04x:%04x] error status/mask=%08x/%08x\n",
|
||||
prefix, dev->vendor, dev->device,
|
||||
dev_err(&dev->dev,
|
||||
" device [%04x:%04x] error status/mask=%08x/%08x\n",
|
||||
dev->vendor, dev->device,
|
||||
info->status, info->mask);
|
||||
|
||||
__aer_print_error(prefix, info);
|
||||
__aer_print_error(dev, info);
|
||||
|
||||
if (info->tlp_header_valid) {
|
||||
unsigned char *tlp = (unsigned char *) &info->tlp;
|
||||
printk("%s"" TLP Header:"
|
||||
dev_err(&dev->dev, " TLP Header:"
|
||||
" %02x%02x%02x%02x %02x%02x%02x%02x"
|
||||
" %02x%02x%02x%02x %02x%02x%02x%02x\n",
|
||||
prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
|
||||
*(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
|
||||
*(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
|
||||
*(tlp + 11), *(tlp + 10), *(tlp + 9),
|
||||
*(tlp + 8), *(tlp + 15), *(tlp + 14),
|
||||
|
@ -192,8 +192,11 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info)
|
|||
}
|
||||
|
||||
if (info->id && info->error_dev_num > 1 && info->id == id)
|
||||
printk("%s"" Error of this Agent(%04x) is reported first\n",
|
||||
prefix, id);
|
||||
dev_err(&dev->dev,
|
||||
" Error of this Agent(%04x) is reported first\n",
|
||||
id);
|
||||
trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask),
|
||||
info->severity);
|
||||
}
|
||||
|
||||
void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info)
|
||||
|
@ -217,7 +220,7 @@ int cper_severity_to_aer(int cper_severity)
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(cper_severity_to_aer);
|
||||
|
||||
void cper_print_aer(const char *prefix, int cper_severity,
|
||||
void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity,
|
||||
struct aer_capability_regs *aer)
|
||||
{
|
||||
int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0;
|
||||
|
@ -239,25 +242,27 @@ void cper_print_aer(const char *prefix, int cper_severity,
|
|||
}
|
||||
layer = AER_GET_LAYER_ERROR(aer_severity, status);
|
||||
agent = AER_GET_AGENT(aer_severity, status);
|
||||
printk("%s""aer_status: 0x%08x, aer_mask: 0x%08x\n",
|
||||
prefix, status, mask);
|
||||
dev_err(&dev->dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n",
|
||||
status, mask);
|
||||
cper_print_bits(prefix, status, status_strs, status_strs_size);
|
||||
printk("%s""aer_layer=%s, aer_agent=%s\n", prefix,
|
||||
dev_err(&dev->dev, "aer_layer=%s, aer_agent=%s\n",
|
||||
aer_error_layer[layer], aer_agent_string[agent]);
|
||||
if (aer_severity != AER_CORRECTABLE)
|
||||
printk("%s""aer_uncor_severity: 0x%08x\n",
|
||||
prefix, aer->uncor_severity);
|
||||
dev_err(&dev->dev, "aer_uncor_severity: 0x%08x\n",
|
||||
aer->uncor_severity);
|
||||
if (tlp_header_valid) {
|
||||
const unsigned char *tlp;
|
||||
tlp = (const unsigned char *)&aer->header_log;
|
||||
printk("%s""aer_tlp_header:"
|
||||
dev_err(&dev->dev, "aer_tlp_header:"
|
||||
" %02x%02x%02x%02x %02x%02x%02x%02x"
|
||||
" %02x%02x%02x%02x %02x%02x%02x%02x\n",
|
||||
prefix, *(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
|
||||
*(tlp + 3), *(tlp + 2), *(tlp + 1), *tlp,
|
||||
*(tlp + 7), *(tlp + 6), *(tlp + 5), *(tlp + 4),
|
||||
*(tlp + 11), *(tlp + 10), *(tlp + 9),
|
||||
*(tlp + 8), *(tlp + 15), *(tlp + 14),
|
||||
*(tlp + 13), *(tlp + 12));
|
||||
}
|
||||
trace_aer_event(dev_name(&dev->dev), (status & ~mask),
|
||||
aer_severity);
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -49,8 +49,8 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
|
|||
}
|
||||
#endif
|
||||
|
||||
extern void cper_print_aer(const char *prefix, int cper_severity,
|
||||
struct aer_capability_regs *aer);
|
||||
extern void cper_print_aer(const char *prefix, struct pci_dev *dev,
|
||||
int cper_severity, struct aer_capability_regs *aer);
|
||||
extern int cper_severity_to_aer(int cper_severity);
|
||||
extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
|
||||
int severity);
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
#undef TRACE_SYSTEM
|
||||
#define TRACE_SYSTEM ras
|
||||
|
||||
#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ)
|
||||
#define _TRACE_AER_H
|
||||
|
||||
#include <linux/tracepoint.h>
#include <linux/edac.h>
#include <linux/pci.h>
#include <linux/aer.h>
|
||||
|
||||
|
||||
/*
|
||||
* PCIe AER Trace event
|
||||
*
|
||||
* These events are generated when hardware detects a corrected or
|
||||
* uncorrected event on a PCIe device. The event report has
|
||||
* the following structure:
|
||||
*
|
||||
* char * dev_name - The name of the slot where the device resides
|
||||
* ([domain:]bus:device.function).
|
||||
* u32 status - Either the correctable or uncorrectable register
|
||||
* indicating what error or errors have been seen
|
||||
* u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED
|
||||
*/
|
||||
|
||||
/*
 * Mask/name pairs used to decode a correctable error status value.
 * The list is expanded as the flag table argument of __print_flags()
 * in the aer_event TP_printk() below; each entry names one status bit.
 * Bit 8 is the REPLAY_NUM Rollover status bit.
 */
#define aer_correctable_errors			\
	{BIT(0),	"Receiver Error"},		\
	{BIT(6),	"Bad TLP"},			\
	{BIT(7),	"Bad DLLP"},			\
	{BIT(8),	"REPLAY_NUM Rollover"},		\
	{BIT(12),	"Replay Timer Timeout"},	\
	{BIT(13),	"Advisory Non-Fatal"}
|
||||
|
||||
/*
 * Mask/name pairs used to decode an uncorrectable error status value.
 * Expanded as the flag table argument of __print_flags() in the
 * aer_event TP_printk() below; one entry per named status bit.
 */
#define aer_uncorrectable_errors			\
	{BIT(4),	"Data Link Protocol"},		\
	{BIT(12),	"Poisoned TLP"},		\
	{BIT(13),	"Flow Control Protocol"},	\
	{BIT(14),	"Completion Timeout"},		\
	{BIT(15),	"Completer Abort"},		\
	{BIT(16),	"Unexpected Completion"},	\
	{BIT(17),	"Receiver Overflow"},		\
	{BIT(18),	"Malformed TLP"},		\
	{BIT(19),	"ECRC"},			\
	{BIT(20),	"Unsupported Request"}
|
||||
|
||||
TRACE_EVENT(aer_event,
|
||||
TP_PROTO(const char *dev_name,
|
||||
const u32 status,
|
||||
const u8 severity),
|
||||
|
||||
TP_ARGS(dev_name, status, severity),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__string( dev_name, dev_name )
|
||||
__field( u32, status )
|
||||
__field( u8, severity )
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__assign_str(dev_name, dev_name);
|
||||
__entry->status = status;
|
||||
__entry->severity = severity;
|
||||
),
|
||||
|
||||
TP_printk("%s PCIe Bus Error: severity=%s, %s\n",
|
||||
__get_str(dev_name),
|
||||
__entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" :
|
||||
__entry->severity == HW_EVENT_ERR_FATAL ?
|
||||
"Fatal" : "Uncorrected",
|
||||
__entry->severity == HW_EVENT_ERR_CORRECTED ?
|
||||
__print_flags(__entry->status, "|", aer_correctable_errors) :
|
||||
__print_flags(__entry->status, "|", aer_uncorrectable_errors))
|
||||
);
|
||||
|
||||
#endif /* _TRACE_AER_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
#include <trace/define_trace.h>
|
Loading…
Reference in New Issue