Commit 89d1e09b authored by Bryan O'Sullivan's avatar Bryan O'Sullivan Committed by Roland Dreier

IB/ipath: Fix and recover TXE piobuf and PBC parity errors

We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest).   Add a
stats counter for these.

Factored out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress likely subsequent error messages if
within two jiffies of the cancellation.

Also restore 2 dropped TXE lines on hwe_bitsextant noticed while
debugging.
Signed-off-by: default avatarBryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: default avatarRoland Dreier <rolandd@cisco.com>
parent 51084775
...@@ -141,8 +141,9 @@ struct infinipath_stats { ...@@ -141,8 +141,9 @@ struct infinipath_stats {
* packets if ipath not configured, etc.) * packets if ipath not configured, etc.)
*/ */
__u64 sps_krdrops; __u64 sps_krdrops;
__u64 sps_txeparity; /* PIO buffer parity error, recovered */
/* pad for future growth */ /* pad for future growth */
__u64 __sps_pad[46]; __u64 __sps_pad[45];
}; };
/* /*
......
...@@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, ...@@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet, * make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds * or it's occurred within the last 5 seconds
*/ */
if ((hwerrs & ~dd->ipath_lasthwerror) || if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG)) (ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs); "(cleared)\n", (unsigned long long) hwerrs);
...@@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, ...@@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) { if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) { if (hwerrs) {
/* /*
* if any set that we aren't ignoring; only * if any set that we aren't ignoring; only
......
...@@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, ...@@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
* make sure we get this much out, unless told to be quiet, * make sure we get this much out, unless told to be quiet,
* or it's occurred within the last 5 seconds * or it's occurred within the last 5 seconds
*/ */
if ((hwerrs & ~dd->ipath_lasthwerror) || if ((hwerrs & ~(dd->ipath_lasthwerror |
((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
(ipath_debug & __IPATH_VERBDBG)) (ipath_debug & __IPATH_VERBDBG))
dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
"(cleared)\n", (unsigned long long) hwerrs); "(cleared)\n", (unsigned long long) hwerrs);
...@@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, ...@@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
if (ctrl & INFINIPATH_C_FREEZEMODE) { if (ctrl & INFINIPATH_C_FREEZEMODE) {
/*
* parity errors in send memory are recoverable,
* just cancel the send (if indicated in * sendbuffererror),
* count the occurrence, unfreeze (if no other handled
* hardware error bits are set), and continue. They can
* occur if a processor speculative read is done to the PIO
* buffer while we are sending a packet, for example.
*/
if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
ipath_stats.sps_txeparity++;
ipath_dbg("Recovering from TXE parity error (%llu), "
"hwerrstatus=%llx\n",
(unsigned long long) ipath_stats.sps_txeparity,
(unsigned long long) hwerrs);
ipath_disarm_senderrbufs(dd);
hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
if (!hwerrs) { /* else leave in freeze mode */
ipath_write_kreg(dd,
dd->ipath_kregs->kr_control,
dd->ipath_control);
return;
}
}
if (hwerrs) { if (hwerrs) {
/* /*
* if any set that we aren't ignoring only make the * if any set that we aren't ignoring only make the
...@@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, ...@@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
} else { } else {
ipath_dbg("Clearing freezemode on ignored hardware " ipath_dbg("Clearing freezemode on ignored hardware "
"error\n"); "error\n");
ctrl &= ~INFINIPATH_C_FREEZEMODE;
ipath_write_kreg(dd, dd->ipath_kregs->kr_control, ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
ctrl); dd->ipath_control);
} }
} }
...@@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd) ...@@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
dd->ipath_hwe_bitsextant = dd->ipath_hwe_bitsextant =
(INFINIPATH_HWE_RXEMEMPARITYERR_MASK << (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
(INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
INFINIPATH_HWE_PCIE1PLLFAILED | INFINIPATH_HWE_PCIE1PLLFAILED |
......
...@@ -37,6 +37,50 @@ ...@@ -37,6 +37,50 @@
#include "ipath_verbs.h" #include "ipath_verbs.h"
#include "ipath_common.h" #include "ipath_common.h"
/*
* Called when we might have an error that is specific to a particular
* PIO buffer, and may need to cancel that buffer, so it can be re-used.
*/
void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
{
u32 piobcnt;
unsigned long sbuf[4];
/*
* it's possible that sendbuffererror could have bits set; might
* have already done this as a result of hardware error handling
*/
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}
if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
int i;
if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
"SendbufErrs %lx %lx", sbuf[0],
sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk(" %lx %lx ", sbuf[2], sbuf[3]);
printk("\n");
}
for (i = 0; i < piobcnt; i++)
if (test_bit(i, sbuf))
ipath_disarm_piobufs(dd, i, 1);
dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
}
}
/* These are all rcv-related errors which we want to count for stats */ /* These are all rcv-related errors which we want to count for stats */
#define E_SUM_PKTERRS \ #define E_SUM_PKTERRS \
(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
...@@ -68,53 +112,9 @@ ...@@ -68,53 +112,9 @@
static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
{ {
unsigned long sbuf[4];
u64 ignore_this_time = 0; u64 ignore_this_time = 0;
u32 piobcnt;
/* if possible that sendbuffererror could be valid */
piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
/* read these before writing errorclear */
sbuf[0] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror);
sbuf[1] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 1);
if (piobcnt > 128) {
sbuf[2] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 2);
sbuf[3] = ipath_read_kreg64(
dd, dd->ipath_kregs->kr_sendbuffererror + 3);
}
if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { ipath_disarm_senderrbufs(dd);
int i;
ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
printk("%lx %lx ", sbuf[2], sbuf[3]);
for (i = 0; i < piobcnt; i++) {
if (test_bit(i, sbuf)) {
u32 __iomem *piobuf;
if (i < dd->ipath_piobcnt2k)
piobuf = (u32 __iomem *)
(dd->ipath_pio2kbase +
i * dd->ipath_palign);
else
piobuf = (u32 __iomem *)
(dd->ipath_pio4kbase +
(i - dd->ipath_piobcnt2k) *
dd->ipath_4kalign);
ipath_cdbg(PKT,
"PIObuf[%u] @%p pbc is %x; ",
i, piobuf, readl(piobuf));
ipath_disarm_piobufs(dd, i, 1);
}
}
if (ipath_debug & __IPATH_PKTDBG)
printk("\n");
}
if ((errs & E_SUM_LINK_PKTERRS) && if ((errs & E_SUM_LINK_PKTERRS) &&
!(dd->ipath_flags & IPATH_LINKACTIVE)) { !(dd->ipath_flags & IPATH_LINKACTIVE)) {
/* /*
...@@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) ...@@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
~(INFINIPATH_E_HARDWARE | ~(INFINIPATH_E_HARDWARE |
INFINIPATH_E_IBSTATUSCHANGED); INFINIPATH_E_IBSTATUSCHANGED);
} }
/* likely due to cancel, so suppress */
if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
dd->ipath_lastcancel > jiffies) {
ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
}
if (!errs) if (!errs)
return 0; return 0;
......
...@@ -427,6 +427,9 @@ struct ipath_devdata { ...@@ -427,6 +427,9 @@ struct ipath_devdata {
unsigned long ipath_rcvctrl; unsigned long ipath_rcvctrl;
/* shadow kr_sendctrl */ /* shadow kr_sendctrl */
unsigned long ipath_sendctrl; unsigned long ipath_sendctrl;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */
/* value we put in kr_rcvhdrcnt */ /* value we put in kr_rcvhdrcnt */
u32 ipath_rcvhdrcnt; u32 ipath_rcvhdrcnt;
...@@ -490,8 +493,6 @@ struct ipath_devdata { ...@@ -490,8 +493,6 @@ struct ipath_devdata {
u32 ipath_htwidth; u32 ipath_htwidth;
/* HT speed (200,400,800,1000) from HT config */ /* HT speed (200,400,800,1000) from HT config */
u32 ipath_htspeed; u32 ipath_htspeed;
/* ports waiting for PIOavail intr */
unsigned long ipath_portpiowait;
/* /*
* number of sequential ibcstatus change for polling active/quiet * number of sequential ibcstatus change for polling active/quiet
* (i.e., link not coming up). * (i.e., link not coming up).
...@@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd); ...@@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
void ipath_disable_wc(struct ipath_devdata *dd); void ipath_disable_wc(struct ipath_devdata *dd);
int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
void ipath_shutdown_device(struct ipath_devdata *); void ipath_shutdown_device(struct ipath_devdata *);
void ipath_disarm_senderrbufs(struct ipath_devdata *);
struct file_operations; struct file_operations;
int ipath_cdev_init(int minor, char *name, struct file_operations *fops, int ipath_cdev_init(int minor, char *name, struct file_operations *fops,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment