Commit fb9b0457 authored by Kashyap Desai, committed by Martin K. Petersen

scsi: mpi3mr: Add support for recovering controller

Detection of firmware fault or any kind of unresponsiveness in the
controller (any admin command which times out) results in resetting the
controller. The primary reset mechanisms used are either soft reset or diag
fault reset. A reset is performed if the host sets the ResetAction field in
the HostDiagnostic register to either 001b (soft reset) or 111b (diag fault
reset). After successfully resetting the controller the driver
reinitializes the controller by going through start of the day
initialization procedure. Pending I/Os during the reset are returned back
to the SCSI midlayer for retry.

Link: https://lore.kernel.org/r/20210520152545.2710479-10-kashyap.desai@broadcom.com
Cc: sathya.prakash@broadcom.com
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Tomas Henzl <thenzl@redhat.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Signed-off-by: Kashyap Desai <kashyap.desai@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent e36710dc
......@@ -103,6 +103,7 @@ extern struct list_head mrioc_list;
#define MPI3MR_INTADMCMD_TIMEOUT 10
#define MPI3MR_PORTENABLE_TIMEOUT 300
#define MPI3MR_RESETTM_TIMEOUT 30
#define MPI3MR_RESET_HOST_IOWAIT_TIMEOUT 5
#define MPI3MR_DEFAULT_SHUTDOWN_TIME 120
#define MPI3MR_WATCHDOG_INTERVAL 1000 /* in milli seconds */
......@@ -635,10 +636,14 @@ struct scmd_priv {
* @dev_handle_bitmap_sz: Device handle bitmap size
* @removepend_bitmap: Remove pending bitmap
* @delayed_rmhs_list: Delayed device removal list
* @fault_dbg: Fault debug flag
* @reset_in_progress: Reset in progress flag
* @unrecoverable: Controller unrecoverable flag
* @reset_mutex: Controller reset mutex
* @reset_waitq: Controller reset wait queue
* @diagsave_timeout: Diagnostic information save timeout
* @logging_level: Controller debug logging level
* @flush_io_count: I/O count to flush after reset
* @current_event: Firmware event currently in process
* @driver_info: Driver, Kernel, OS information to firmware
* @change_count: Topology change count
......@@ -753,11 +758,15 @@ struct mpi3mr_ioc {
void *removepend_bitmap;
struct list_head delayed_rmhs_list;
u8 fault_dbg;
u8 reset_in_progress;
u8 unrecoverable;
struct mutex reset_mutex;
wait_queue_head_t reset_waitq;
u16 diagsave_timeout;
int logging_level;
u16 flush_io_count;
struct mpi3mr_fwevt *current_event;
struct mpi3_driver_info_layout driver_info;
......@@ -806,8 +815,8 @@ struct delayed_dev_rmhs_node {
int mpi3mr_setup_resources(struct mpi3mr_ioc *mrioc);
void mpi3mr_cleanup_resources(struct mpi3mr_ioc *mrioc);
int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc);
void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc);
int mpi3mr_init_ioc(struct mpi3mr_ioc *mrioc, u8 re_init);
void mpi3mr_cleanup_ioc(struct mpi3mr_ioc *mrioc, u8 re_init);
int mpi3mr_issue_port_enable(struct mpi3mr_ioc *mrioc, u8 async);
int mpi3mr_admin_request_post(struct mpi3mr_ioc *mrioc, void *admin_req,
u16 admin_req_sz, u8 ignore_reset);
......@@ -833,6 +842,8 @@ void mpi3mr_stop_watchdog(struct mpi3mr_ioc *mrioc);
int mpi3mr_soft_reset_handler(struct mpi3mr_ioc *mrioc,
u32 reset_reason, u8 snapdump);
int mpi3mr_diagfault_reset_handler(struct mpi3mr_ioc *mrioc,
u32 reset_reason);
void mpi3mr_ioc_disable_intr(struct mpi3mr_ioc *mrioc);
void mpi3mr_ioc_enable_intr(struct mpi3mr_ioc *mrioc);
......
This diff is collapsed.
......@@ -310,6 +310,86 @@ void mpi3mr_cleanup_fwevt_list(struct mpi3mr_ioc *mrioc)
}
}
/**
 * mpi3mr_invalidate_devhandles - Invalidate device handles
 * @mrioc: Adapter instance reference
 *
 * Walk the adapter's target device list and mark the device handle of
 * every entry as invalid. When an entry has a SCSI target with private
 * data attached, the handle stored in that private data is invalidated
 * as well. Called post reset prior to reinitializing the controller.
 *
 * Return: Nothing.
 */
void mpi3mr_invalidate_devhandles(struct mpi3mr_ioc *mrioc)
{
	struct mpi3mr_tgt_dev *dev;
	struct mpi3mr_stgt_priv_data *stgt_priv;

	list_for_each_entry(dev, &mrioc->tgtdev_list, list) {
		dev->dev_handle = MPI3MR_INVALID_DEV_HANDLE;

		/* No SCSI target attached: only the list entry needs updating. */
		if (!dev->starget)
			continue;

		stgt_priv = dev->starget->hostdata;
		if (stgt_priv)
			stgt_priv->dev_handle = MPI3MR_INVALID_DEV_HANDLE;
	}
}
/**
 * mpi3mr_flush_scmd - Flush individual SCSI command
 * @rq: Block request
 * @data: Adapter instance reference
 * @reserved: Not used by this callback
 *
 * If the SCSI command behind @rq is in LLD scope, complete it back to
 * the upper layers with a DID_RESET result and bump the adapter's
 * flush counter. Commands that are not in LLD scope are left untouched.
 *
 * Return: true always.
 */
static bool mpi3mr_flush_scmd(struct request *rq,
	void *data, bool reserved)
{
	struct mpi3mr_ioc *mrioc = data;
	struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
	struct scmd_priv *cmd_priv;

	if (!scmd)
		return true;

	cmd_priv = scsi_cmd_priv(scmd);
	if (cmd_priv->in_lld_scope) {
		/* Tear down before handing the command back via scsi_done(). */
		mpi3mr_clear_scmd_priv(mrioc, scmd);
		scsi_dma_unmap(scmd);

		scmd->result = DID_RESET << 16;
		scsi_print_command(scmd);
		scmd->scsi_done(scmd);
		mrioc->flush_io_count++;
	}

	return true;
}
/**
 * mpi3mr_flush_host_io - Flush host I/Os
 * @mrioc: Adapter instance reference
 *
 * Reset the adapter's flush counter, run mpi3mr_flush_scmd() over the
 * SCSI host's tag set through blk_mq_tagset_busy_iter(), and log the
 * number of commands that were flushed. This is executed post
 * controller reset.
 *
 * Return: Nothing.
 */
void mpi3mr_flush_host_io(struct mpi3mr_ioc *mrioc)
{
	/* Start from zero so the final log line reflects this pass only. */
	mrioc->flush_io_count = 0;
	ioc_info(mrioc, "%s :Flushing Host I/O cmds post reset\n", __func__);

	blk_mq_tagset_busy_iter(&mrioc->shost->tag_set,
	    mpi3mr_flush_scmd, mrioc);

	ioc_info(mrioc, "%s :Flushed %d Host I/O cmds\n", __func__,
	    mrioc->flush_io_count);
}
/**
* mpi3mr_alloc_tgtdev - target device allocator
*
......@@ -2495,6 +2575,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_LIST_HEAD(&mrioc->tgtdev_list);
INIT_LIST_HEAD(&mrioc->delayed_rmhs_list);
mutex_init(&mrioc->reset_mutex);
mpi3mr_init_drv_cmd(&mrioc->init_cmds, MPI3MR_HOSTTAG_INITCMDS);
for (i = 0; i < MPI3MR_NUM_DEVRMCMD; i++)
......@@ -2504,6 +2585,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (pdev->revision)
mrioc->enable_segqueue = true;
init_waitqueue_head(&mrioc->reset_waitq);
mrioc->logging_level = logging_level;
mrioc->shost = shost;
mrioc->pdev = pdev;
......@@ -2528,7 +2610,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
}
mrioc->is_driver_loading = 1;
if (mpi3mr_init_ioc(mrioc)) {
if (mpi3mr_init_ioc(mrioc, 0)) {
ioc_err(mrioc, "failure at %s:%d/%s()!\n",
__FILE__, __LINE__, __func__);
retval = -ENODEV;
......@@ -2551,7 +2633,7 @@ mpi3mr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
return retval;
addhost_failed:
mpi3mr_cleanup_ioc(mrioc);
mpi3mr_cleanup_ioc(mrioc, 0);
out_iocinit_failed:
destroy_workqueue(mrioc->fwevt_worker_thread);
out_fwevtthread_failed:
......@@ -2600,7 +2682,7 @@ static void mpi3mr_remove(struct pci_dev *pdev)
mpi3mr_tgtdev_del_from_list(mrioc, tgtdev);
mpi3mr_tgtdev_put(tgtdev);
}
mpi3mr_cleanup_ioc(mrioc);
mpi3mr_cleanup_ioc(mrioc, 0);
spin_lock(&mrioc_list_lock);
list_del(&mrioc->list);
......@@ -2640,7 +2722,7 @@ static void mpi3mr_shutdown(struct pci_dev *pdev)
spin_unlock_irqrestore(&mrioc->fwevt_lock, flags);
if (wq)
destroy_workqueue(wq);
mpi3mr_cleanup_ioc(mrioc);
mpi3mr_cleanup_ioc(mrioc, 0);
}
static const struct pci_device_id mpi3mr_pci_id_table[] = {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment