IB/ipath: Fix and recover TXE piobuf and PBC parity errors

We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest).   Add a
stats counter for these.

Factored out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress likely subsequent error messages if
within two jiffies of the cancellation.

Also restore 2 dropped TXE lines on hwe_bitsextant noticed while
debugging.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
Bryan O'Sullivan 2006-09-28 09:00:18 -07:00 committed by Roland Dreier
parent 510847750c
commit 89d1e09b6a
5 changed files with 124 additions and 52 deletions

View File

@ -141,8 +141,9 @@ struct infinipath_stats {
* packets if ipath not configured, etc.)
*/
__u64 sps_krdrops;
__u64 sps_txeparity; /* PIO buffer parity error, recovered */
/* pad for future growth */
__u64 __sps_pad[46];
__u64 __sps_pad[45];
};
/*

View File

@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds
*/
if ((hwerrs & ~dd->ipath_lasthwerror) ||
if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs);
@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) {
/*
* if any set that we aren't ignoring; only

View File

@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds
*/
if ((hwerrs & ~dd->ipath_lasthwerror) ||
if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs);
@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) {
/*
* if any set that we aren't ignoring only make the
@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
} else {
ipath_dbg("Clearing freezemode on ignored hardware "
"error\n");
ctrl &= ~INFINIPATH_C_FREEZEMODE;
ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
ctrl);
dd->ipath_control);
}
}
@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
dd->ipath_hwe_bitsextant =
(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
INFINIPATH_HWE_PCIE1PLLFAILED |

View File

@ -37,6 +37,50 @@
#include "ipath_verbs.h"
#include "ipath_common.h"
/*
* Called when we might have an error that is specific to a particular
* PIO buffer, and may need to cancel that buffer, so it can be re-used.
*/
void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
{
u32 piobcnt;
unsigned long sbuf[4];
/*
* it's possible that sendbuffererror could have bits set; might
* have already done this as a result of hardware error handling
*/
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}
if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
int i;
if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
"SendbufErrs %lx %lx", sbuf[0],
sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk(" %lx %lx ", sbuf[2], sbuf[3]);
printk("\n");
}
for (i = 0; i < piobcnt; i++)
if (test_bit(i, sbuf))
ipath_disarm_piobufs(dd, i, 1);
dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
}
}
/* These are all rcv-related errors which we want to count for stats */
#define E_SUM_PKTERRS \
(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
@ -68,53 +112,9 @@
static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
{
unsigned long sbuf[4];
u64 ignore_this_time = 0;
u32 piobcnt;
/* if possible that sendbuffererror could be valid */
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}
if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
int i;
ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk("%lx %lx ", sbuf[2], sbuf[3]);
for (i = 0; i < piobcnt; i++) {
if (test_bit(i, sbuf)) {
u32 __iomem *piobuf;
if (i < dd->ipath_piobcnt2k)
piobuf = (u32 __iomem *)
(dd->ipath_pio2kbase +
i * dd->ipath_palign);
else
piobuf = (u32 __iomem *)
(dd->ipath_pio4kbase +
(i - dd->ipath_piobcnt2k) *
dd->ipath_4kalign);
ipath_cdbg(PKT,
"PIObuf[%u] @%p pbc is %x; ",
i, piobuf, readl(piobuf));
ipath_disarm_piobufs(dd, i, 1);
}
}
if (ipath_debug & __IPATH_PKTDBG)
printk("\n");
}
ipath_disarm_senderrbufs(dd);
if ((errs & E_SUM_LINK_PKTERRS) &&
!(dd->ipath_flags & IPATH_LINKACTIVE)) {
/*
@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
~(INFINIPATH_E_HARDWARE |
INFINIPATH_E_IBSTATUSCHANGED);
}
/* likely due to cancel, so suppress */
if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
dd->ipath_lastcancel > jiffies) {
ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
}
if (!errs)
return 0;

View File

@ -427,6 +427,9 @@ struct ipath_devdata {
unsigned long ipath_rcvctrl;
/* shadow kr_sendctrl */
unsigned long ipath_sendctrl;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */
/* value we put in kr_rcvhdrcnt */
u32 ipath_rcvhdrcnt;
@ -490,8 +493,6 @@ struct ipath_devdata {
u32 ipath_htwidth;
/* HT speed (200,400,800,1000) from HT config */
u32 ipath_htspeed;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
/*
* number of sequential ibcstatus change for polling active/quiet
* (i.e., link not coming up).
@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
void ipath_disable_wc(struct ipath_devdata *dd);
int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
void ipath_shutdown_device(struct ipath_devdata *);
void ipath_disarm_senderrbufs(struct ipath_devdata *);
struct file_operations;
int ipath_cdev_init(int minor, char *name, struct file_operations *fops,