Commit b3b30f5e authored by Jack Morgenstein, committed by Roland Dreier

IB/mthca: Recover from catastrophic errors

Trigger device remove and then add when a catastrophic error is
detected in hardware.  This, in turn, will cause a device reset, which
we hope will recover from the catastrophic condition.

Since this might interfere with debugging the root cause, add a
module option to suppress this behaviour.
Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent 07eeec06
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <linux/jiffies.h> #include <linux/jiffies.h>
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/workqueue.h>
#include "mthca_dev.h" #include "mthca_dev.h"
...@@ -48,9 +49,41 @@ enum { ...@@ -48,9 +49,41 @@ enum {
static DEFINE_SPINLOCK(catas_lock); static DEFINE_SPINLOCK(catas_lock);
static LIST_HEAD(catas_list);
static struct workqueue_struct *catas_wq;
static struct work_struct catas_work;
static int catas_reset_disable;
module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
/*
 * Work handler queued by handle_catas(): restart every device that has
 * been placed on catas_list after a catastrophic error was detected.
 *
 * @work_ptr: unused; required by the work-function signature.
 *
 * The whole pass runs under mthca_device_mutex so it is serialized
 * against mthca_init_one() and mthca_remove_one().
 */
static void catas_reset(void *work_ptr)
{
	struct mthca_dev *dev, *tmpdev;
	LIST_HEAD(tlist);
	int ret;

	mutex_lock(&mthca_device_mutex);

	/*
	 * Move the pending devices to a private list so that catas_lock
	 * (a spinlock) is not held while the devices are restarted.
	 */
	spin_lock_irq(&catas_lock);
	list_splice_init(&catas_list, &tlist);
	spin_unlock_irq(&catas_lock);

	list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
		/* Save pdev now: 'dev' is torn down by the restart below. */
		struct pci_dev *pdev = dev->pdev;

		ret = __mthca_restart_one(pdev);
		/*
		 * 'dev' must not be dereferenced from here on: the restart
		 * removes the old device and, on success, installs a new
		 * mthca_dev as the PCI driver data.
		 */
		if (ret) {
			printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
			       pci_name(pdev), ret);
		} else {
			struct mthca_dev *d = pci_get_drvdata(pdev);

			mthca_dbg(d, "Reset succeeded\n");
		}
	}

	mutex_unlock(&mthca_device_mutex);
}
static void handle_catas(struct mthca_dev *dev) static void handle_catas(struct mthca_dev *dev)
{ {
struct ib_event event; struct ib_event event;
unsigned long flags;
const char *type; const char *type;
int i; int i;
...@@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev) ...@@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev)
for (i = 0; i < dev->catas_err.size; ++i) for (i = 0; i < dev->catas_err.size; ++i)
mthca_err(dev, " buf[%02x]: %08x\n", mthca_err(dev, " buf[%02x]: %08x\n",
i, swab32(readl(dev->catas_err.map + i))); i, swab32(readl(dev->catas_err.map + i)));
if (catas_reset_disable)
return;
spin_lock_irqsave(&catas_lock, flags);
list_add(&dev->catas_err.list, &catas_list);
queue_work(catas_wq, &catas_work);
spin_unlock_irqrestore(&catas_lock, flags);
} }
static void poll_catas(unsigned long dev_ptr) static void poll_catas(unsigned long dev_ptr)
...@@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev) ...@@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
dev->catas_err.timer.data = (unsigned long) dev; dev->catas_err.timer.data = (unsigned long) dev;
dev->catas_err.timer.function = poll_catas; dev->catas_err.timer.function = poll_catas;
dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
INIT_LIST_HEAD(&dev->catas_err.list);
add_timer(&dev->catas_err.timer); add_timer(&dev->catas_err.timer);
} }
...@@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev) ...@@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev)
dev->catas_err.addr), dev->catas_err.addr),
dev->catas_err.size * 4); dev->catas_err.size * 4);
} }
spin_lock_irq(&catas_lock);
list_del(&dev->catas_err.list);
spin_unlock_irq(&catas_lock);
}
/*
 * Module-load setup for catastrophic-error recovery: bind catas_work to
 * catas_reset() and create the single-threaded workqueue it is queued on.
 *
 * Returns 0 on success, or -ENOMEM if the workqueue cannot be created.
 */
int __init mthca_catas_init(void)
{
	INIT_WORK(&catas_work, catas_reset, NULL);

	catas_wq = create_singlethread_workqueue("mthca_catas");

	return catas_wq ? 0 : -ENOMEM;
}
/*
 * Counterpart of mthca_catas_init(): destroy the workqueue used to run
 * catastrophic-error resets.
 */
void mthca_catas_cleanup(void)
{
destroy_workqueue(catas_wq);
} }
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <linux/timer.h> #include <linux/timer.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/list.h>
#include <asm/semaphore.h> #include <asm/semaphore.h>
...@@ -283,8 +284,11 @@ struct mthca_catas_err { ...@@ -283,8 +284,11 @@ struct mthca_catas_err {
unsigned long stop; unsigned long stop;
u32 size; u32 size;
struct timer_list timer; struct timer_list timer;
struct list_head list;
}; };
extern struct mutex mthca_device_mutex;
struct mthca_dev { struct mthca_dev {
struct ib_device ib_dev; struct ib_device ib_dev;
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev); ...@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev);
void mthca_start_catas_poll(struct mthca_dev *dev); void mthca_start_catas_poll(struct mthca_dev *dev);
void mthca_stop_catas_poll(struct mthca_dev *dev); void mthca_stop_catas_poll(struct mthca_dev *dev);
int __mthca_restart_one(struct pci_dev *pdev);
int mthca_catas_init(void);
void mthca_catas_cleanup(void);
int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
......
...@@ -80,6 +80,8 @@ static int tune_pci = 0; ...@@ -80,6 +80,8 @@ static int tune_pci = 0;
module_param(tune_pci, int, 0444); module_param(tune_pci, int, 0444);
MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
struct mutex mthca_device_mutex;
static const char mthca_version[] __devinitdata = static const char mthca_version[] __devinitdata =
DRV_NAME ": Mellanox InfiniBand HCA driver v" DRV_NAME ": Mellanox InfiniBand HCA driver v"
DRV_VERSION " (" DRV_RELDATE ")\n"; DRV_VERSION " (" DRV_RELDATE ")\n";
...@@ -978,28 +980,15 @@ static struct { ...@@ -978,28 +980,15 @@ static struct {
MTHCA_FLAG_SINAI_OPT } MTHCA_FLAG_SINAI_OPT }
}; };
static int __devinit mthca_init_one(struct pci_dev *pdev, static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
const struct pci_device_id *id)
{ {
static int mthca_version_printed = 0;
int ddr_hidden = 0; int ddr_hidden = 0;
int err; int err;
struct mthca_dev *mdev; struct mthca_dev *mdev;
if (!mthca_version_printed) {
printk(KERN_INFO "%s", mthca_version);
++mthca_version_printed;
}
printk(KERN_INFO PFX "Initializing %s\n", printk(KERN_INFO PFX "Initializing %s\n",
pci_name(pdev)); pci_name(pdev));
if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
pci_name(pdev), id->driver_data);
return -ENODEV;
}
err = pci_enable_device(pdev); err = pci_enable_device(pdev);
if (err) { if (err) {
dev_err(&pdev->dev, "Cannot enable PCI device, " dev_err(&pdev->dev, "Cannot enable PCI device, "
...@@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, ...@@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
mdev->pdev = pdev; mdev->pdev = pdev;
mdev->mthca_flags = mthca_hca_table[id->driver_data].flags; mdev->mthca_flags = mthca_hca_table[hca_type].flags;
if (ddr_hidden) if (ddr_hidden)
mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
...@@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, ...@@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
if (err) if (err)
goto err_cmd; goto err_cmd;
if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) { if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n",
(int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
(int) (mdev->fw_ver & 0xffff), (int) (mdev->fw_ver & 0xffff),
(int) (mthca_hca_table[id->driver_data].latest_fw >> 32), (int) (mthca_hca_table[hca_type].latest_fw >> 32),
(int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff, (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
(int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff)); (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
} }
...@@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, ...@@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
goto err_unregister; goto err_unregister;
pci_set_drvdata(pdev, mdev); pci_set_drvdata(pdev, mdev);
mdev->hca_type = hca_type;
return 0; return 0;
...@@ -1166,7 +1156,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, ...@@ -1166,7 +1156,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
return err; return err;
} }
static void __devexit mthca_remove_one(struct pci_dev *pdev) static void __mthca_remove_one(struct pci_dev *pdev)
{ {
struct mthca_dev *mdev = pci_get_drvdata(pdev); struct mthca_dev *mdev = pci_get_drvdata(pdev);
u8 status; u8 status;
...@@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev) ...@@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev)
} }
} }
/*
 * Remove and re-initialize the HCA behind @pdev, keeping its HCA type.
 *
 * Takes no lock itself; catas_reset() calls it with mthca_device_mutex
 * held.
 *
 * Returns -ENODEV if no mthca device is attached to @pdev, otherwise the
 * result of __mthca_init_one().
 */
int __mthca_restart_one(struct pci_dev *pdev)
{
	struct mthca_dev *mdev;
	int hca_type;

	mdev = pci_get_drvdata(pdev);
	if (!mdev)
		return -ENODEV;

	/*
	 * Read the type before removal: __mthca_remove_one() tears down the
	 * device that owns 'mdev', so it must not be dereferenced afterwards.
	 */
	hca_type = mdev->hca_type;
	__mthca_remove_one(pdev);

	return __mthca_init_one(pdev, hca_type);
}
/*
 * PCI probe entry point.  Prints the driver version banner once,
 * validates the table index carried in @id->driver_data, and performs
 * the actual bring-up in __mthca_init_one() while holding
 * mthca_device_mutex.
 *
 * Returns -ENODEV for an out-of-range driver_data value, otherwise the
 * result of __mthca_init_one().
 */
static int __devinit mthca_init_one(struct pci_dev *pdev,
				    const struct pci_device_id *id)
{
	static int mthca_version_printed = 0;
	int ret;

	mutex_lock(&mthca_device_mutex);

	if (!mthca_version_printed) {
		printk(KERN_INFO "%s", mthca_version);
		++mthca_version_printed;
	}

	if (id->driver_data < ARRAY_SIZE(mthca_hca_table)) {
		ret = __mthca_init_one(pdev, id->driver_data);
	} else {
		printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
		       pci_name(pdev), id->driver_data);
		ret = -ENODEV;
	}

	mutex_unlock(&mthca_device_mutex);

	return ret;
}
/*
 * PCI remove entry point: run __mthca_remove_one() under
 * mthca_device_mutex, the same mutex taken by mthca_init_one() and by
 * the catastrophic-error reset work (catas_reset()).
 */
static void __devexit mthca_remove_one(struct pci_dev *pdev)
{
mutex_lock(&mthca_device_mutex);
__mthca_remove_one(pdev);
mutex_unlock(&mthca_device_mutex);
}
static struct pci_device_id mthca_pci_table[] = { static struct pci_device_id mthca_pci_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
.driver_data = TAVOR }, .driver_data = TAVOR },
...@@ -1248,13 +1283,24 @@ static int __init mthca_init(void) ...@@ -1248,13 +1283,24 @@ static int __init mthca_init(void)
{ {
int ret; int ret;
mutex_init(&mthca_device_mutex);
ret = mthca_catas_init();
if (ret)
return ret;
ret = pci_register_driver(&mthca_driver); ret = pci_register_driver(&mthca_driver);
return ret < 0 ? ret : 0; if (ret < 0) {
mthca_catas_cleanup();
return ret;
}
return 0;
} }
static void __exit mthca_cleanup(void) static void __exit mthca_cleanup(void)
{ {
pci_unregister_driver(&mthca_driver); pci_unregister_driver(&mthca_driver);
mthca_catas_cleanup();
} }
module_init(mthca_init); module_init(mthca_init);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment