Commit fce0aa08 authored by Sreekanth Reddy, committed by Martin K. Petersen

scsi: mpt3sas: Handle CoreDump state from watchdog thread

The watchdog thread polls the IOC state every second. If it detects that
the IOC is in CoreDump state, it immediately stops the IOs and clears the
outstanding commands issued to the HBA firmware. It then keeps polling for
the IOC to leave CoreDump state: once the IOC state changes from CoreDump
to Fault, or CoreDumpTOSec seconds have elapsed, it issues a host reset
operation, which moves the IOC to Operational state, and resumes the IOs.

Whenever a TM is received from the SML and the driver detects that the IOC
is in CoreDump state, it waits for the CoreDump state to be cleared and
then issues a host reset operation.

Link: https://lore.kernel.org/r/20191226111333.26131-6-sreekanth.reddy@broadcom.com
Signed-off-by: Sreekanth Reddy <sreekanth.reddy@broadcom.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent e8c2307e
...@@ -128,6 +128,10 @@ _base_wait_on_iocstate(struct MPT3SAS_ADAPTER *ioc, ...@@ -128,6 +128,10 @@ _base_wait_on_iocstate(struct MPT3SAS_ADAPTER *ioc,
u32 ioc_state, int timeout); u32 ioc_state, int timeout);
static int static int
_base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc); _base_get_ioc_facts(struct MPT3SAS_ADAPTER *ioc);
static void
_base_mask_interrupts(struct MPT3SAS_ADAPTER *ioc);
static void
_base_clear_outstanding_commands(struct MPT3SAS_ADAPTER *ioc);
/** /**
* mpt3sas_base_check_cmd_timeout - Function * mpt3sas_base_check_cmd_timeout - Function
...@@ -612,7 +616,8 @@ _base_fault_reset_work(struct work_struct *work) ...@@ -612,7 +616,8 @@ _base_fault_reset_work(struct work_struct *work)
spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags); spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags);
if (ioc->shost_recovery || ioc->pci_error_recovery) if ((ioc->shost_recovery && (ioc->ioc_coredump_loop == 0)) ||
ioc->pci_error_recovery)
goto rearm_timer; goto rearm_timer;
spin_unlock_irqrestore(&ioc->ioc_reset_in_progress_lock, flags); spin_unlock_irqrestore(&ioc->ioc_reset_in_progress_lock, flags);
...@@ -659,20 +664,64 @@ _base_fault_reset_work(struct work_struct *work) ...@@ -659,20 +664,64 @@ _base_fault_reset_work(struct work_struct *work)
return; /* don't rearm timer */ return; /* don't rearm timer */
} }
ioc->non_operational_loop = 0; if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_COREDUMP) {
u8 timeout = (ioc->manu_pg11.CoreDumpTOSec) ?
ioc->manu_pg11.CoreDumpTOSec :
MPT3SAS_DEFAULT_COREDUMP_TIMEOUT_SECONDS;
timeout /= (FAULT_POLLING_INTERVAL/1000);
if (ioc->ioc_coredump_loop == 0) {
mpt3sas_base_coredump_info(ioc,
doorbell & MPI2_DOORBELL_DATA_MASK);
/* do not accept any IOs and disable the interrupts */
spin_lock_irqsave(
&ioc->ioc_reset_in_progress_lock, flags);
ioc->shost_recovery = 1;
spin_unlock_irqrestore(
&ioc->ioc_reset_in_progress_lock, flags);
_base_mask_interrupts(ioc);
_base_clear_outstanding_commands(ioc);
}
ioc_info(ioc, "%s: CoreDump loop %d.",
__func__, ioc->ioc_coredump_loop);
/* Wait until CoreDump completes or times out */
if (ioc->ioc_coredump_loop++ < timeout) {
spin_lock_irqsave(
&ioc->ioc_reset_in_progress_lock, flags);
goto rearm_timer;
}
}
if (ioc->ioc_coredump_loop) {
if ((doorbell & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_COREDUMP)
ioc_err(ioc, "%s: CoreDump completed. LoopCount: %d",
__func__, ioc->ioc_coredump_loop);
else
ioc_err(ioc, "%s: CoreDump Timed out. LoopCount: %d",
__func__, ioc->ioc_coredump_loop);
ioc->ioc_coredump_loop = MPT3SAS_COREDUMP_LOOP_DONE;
}
ioc->non_operational_loop = 0;
if ((doorbell & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_OPERATIONAL) { if ((doorbell & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_OPERATIONAL) {
rc = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); rc = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER);
ioc_warn(ioc, "%s: hard reset: %s\n", ioc_warn(ioc, "%s: hard reset: %s\n",
__func__, rc == 0 ? "success" : "failed"); __func__, rc == 0 ? "success" : "failed");
doorbell = mpt3sas_base_get_iocstate(ioc, 0); doorbell = mpt3sas_base_get_iocstate(ioc, 0);
if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
mpt3sas_base_fault_info(ioc, doorbell & mpt3sas_base_fault_info(ioc, doorbell &
MPI2_DOORBELL_DATA_MASK); MPI2_DOORBELL_DATA_MASK);
} else if ((doorbell & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_COREDUMP)
mpt3sas_base_coredump_info(ioc, doorbell &
MPI2_DOORBELL_DATA_MASK);
if (rc && (doorbell & MPI2_IOC_STATE_MASK) != if (rc && (doorbell & MPI2_IOC_STATE_MASK) !=
MPI2_IOC_STATE_OPERATIONAL) MPI2_IOC_STATE_OPERATIONAL)
return; /* don't rearm timer */ return; /* don't rearm timer */
} }
ioc->ioc_coredump_loop = 0;
spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags); spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags);
rearm_timer: rearm_timer:
...@@ -6815,9 +6864,19 @@ _base_make_ioc_ready(struct MPT3SAS_ADAPTER *ioc, enum reset_type type) ...@@ -6815,9 +6864,19 @@ _base_make_ioc_ready(struct MPT3SAS_ADAPTER *ioc, enum reset_type type)
} }
if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_COREDUMP) { if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_COREDUMP) {
mpt3sas_base_coredump_info(ioc, ioc_state & /*
MPI2_DOORBELL_DATA_MASK); * if host reset is invoked while watch dog thread is waiting
mpt3sas_base_wait_for_coredump_completion(ioc, __func__); * for IOC state to be changed to Fault state then driver has
* to wait here for CoreDump state to clear otherwise reset
* will be issued to the FW and FW move the IOC state to
* reset state without copying the FW logs to coredump region.
*/
if (ioc->ioc_coredump_loop != MPT3SAS_COREDUMP_LOOP_DONE) {
mpt3sas_base_coredump_info(ioc, ioc_state &
MPI2_DOORBELL_DATA_MASK);
mpt3sas_base_wait_for_coredump_completion(ioc,
__func__);
}
goto issue_diag_reset; goto issue_diag_reset;
} }
...@@ -7301,6 +7360,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) ...@@ -7301,6 +7360,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
sizeof(struct mpt3sas_facts)); sizeof(struct mpt3sas_facts));
ioc->non_operational_loop = 0; ioc->non_operational_loop = 0;
ioc->ioc_coredump_loop = 0;
ioc->got_task_abort_from_ioctl = 0; ioc->got_task_abort_from_ioctl = 0;
return 0; return 0;
...@@ -7591,7 +7651,9 @@ mpt3sas_base_hard_reset_handler(struct MPT3SAS_ADAPTER *ioc, ...@@ -7591,7 +7651,9 @@ mpt3sas_base_hard_reset_handler(struct MPT3SAS_ADAPTER *ioc,
MPT3_DIAG_BUFFER_IS_RELEASED))) { MPT3_DIAG_BUFFER_IS_RELEASED))) {
is_trigger = 1; is_trigger = 1;
ioc_state = mpt3sas_base_get_iocstate(ioc, 0); ioc_state = mpt3sas_base_get_iocstate(ioc, 0);
if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) if ((ioc_state & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT ||
(ioc_state & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_COREDUMP)
is_fault = 1; is_fault = 1;
} }
_base_pre_reset_handler(ioc); _base_pre_reset_handler(ioc);
......
...@@ -92,6 +92,7 @@ ...@@ -92,6 +92,7 @@
/* CoreDump: Default timeout */ /* CoreDump: Default timeout */
#define MPT3SAS_DEFAULT_COREDUMP_TIMEOUT_SECONDS (15) /*15 seconds*/ #define MPT3SAS_DEFAULT_COREDUMP_TIMEOUT_SECONDS (15) /*15 seconds*/
#define MPT3SAS_COREDUMP_LOOP_DONE (0xFF)
/* /*
* Set MPT3SAS_SG_DEPTH value based on user input. * Set MPT3SAS_SG_DEPTH value based on user input.
...@@ -1054,6 +1055,7 @@ typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc); ...@@ -1054,6 +1055,7 @@ typedef void (*MPT3SAS_FLUSH_RUNNING_CMDS)(struct MPT3SAS_ADAPTER *ioc);
* @cpu_msix_table: table for mapping cpus to msix index * @cpu_msix_table: table for mapping cpus to msix index
* @cpu_msix_table_sz: table size * @cpu_msix_table_sz: table size
* @total_io_cnt: Gives total IO count, used to load balance the interrupts * @total_io_cnt: Gives total IO count, used to load balance the interrupts
* @ioc_coredump_loop: will have non-zero value when FW is in CoreDump state
* @high_iops_outstanding: used to load balance the interrupts * @high_iops_outstanding: used to load balance the interrupts
* within high iops reply queues * within high iops reply queues
* @msix_load_balance: Enables load balancing of interrupts across * @msix_load_balance: Enables load balancing of interrupts across
...@@ -1244,6 +1246,7 @@ struct MPT3SAS_ADAPTER { ...@@ -1244,6 +1246,7 @@ struct MPT3SAS_ADAPTER {
u32 ioc_reset_count; u32 ioc_reset_count;
MPT3SAS_FLUSH_RUNNING_CMDS schedule_dead_ioc_flush_running_cmds; MPT3SAS_FLUSH_RUNNING_CMDS schedule_dead_ioc_flush_running_cmds;
u32 non_operational_loop; u32 non_operational_loop;
u8 ioc_coredump_loop;
atomic64_t total_io_cnt; atomic64_t total_io_cnt;
atomic64_t high_iops_outstanding; atomic64_t high_iops_outstanding;
bool msix_load_balance; bool msix_load_balance;
......
...@@ -2749,6 +2749,12 @@ mpt3sas_scsih_issue_tm(struct MPT3SAS_ADAPTER *ioc, u16 handle, u64 lun, ...@@ -2749,6 +2749,12 @@ mpt3sas_scsih_issue_tm(struct MPT3SAS_ADAPTER *ioc, u16 handle, u64 lun,
MPI2_DOORBELL_DATA_MASK); MPI2_DOORBELL_DATA_MASK);
rc = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); rc = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER);
return (!rc) ? SUCCESS : FAILED; return (!rc) ? SUCCESS : FAILED;
} else if ((ioc_state & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_COREDUMP) {
mpt3sas_base_coredump_info(ioc, ioc_state &
MPI2_DOORBELL_DATA_MASK);
rc = mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER);
return (!rc) ? SUCCESS : FAILED;
} }
smid = mpt3sas_base_get_smid_hpr(ioc, ioc->tm_cb_idx); smid = mpt3sas_base_get_smid_hpr(ioc, ioc->tm_cb_idx);
...@@ -4525,6 +4531,7 @@ static void ...@@ -4525,6 +4531,7 @@ static void
_scsih_temp_threshold_events(struct MPT3SAS_ADAPTER *ioc, _scsih_temp_threshold_events(struct MPT3SAS_ADAPTER *ioc,
Mpi2EventDataTemperature_t *event_data) Mpi2EventDataTemperature_t *event_data)
{ {
u32 doorbell;
if (ioc->temp_sensors_count >= event_data->SensorNum) { if (ioc->temp_sensors_count >= event_data->SensorNum) {
ioc_err(ioc, "Temperature Threshold flags %s%s%s%s exceeded for Sensor: %d !!!\n", ioc_err(ioc, "Temperature Threshold flags %s%s%s%s exceeded for Sensor: %d !!!\n",
le16_to_cpu(event_data->Status) & 0x1 ? "0 " : " ", le16_to_cpu(event_data->Status) & 0x1 ? "0 " : " ",
...@@ -4534,6 +4541,18 @@ _scsih_temp_threshold_events(struct MPT3SAS_ADAPTER *ioc, ...@@ -4534,6 +4541,18 @@ _scsih_temp_threshold_events(struct MPT3SAS_ADAPTER *ioc,
event_data->SensorNum); event_data->SensorNum);
ioc_err(ioc, "Current Temp In Celsius: %d\n", ioc_err(ioc, "Current Temp In Celsius: %d\n",
event_data->CurrentTemperature); event_data->CurrentTemperature);
if (ioc->hba_mpi_version_belonged != MPI2_VERSION) {
doorbell = mpt3sas_base_get_iocstate(ioc, 0);
if ((doorbell & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_FAULT) {
mpt3sas_base_fault_info(ioc,
doorbell & MPI2_DOORBELL_DATA_MASK);
} else if ((doorbell & MPI2_IOC_STATE_MASK) ==
MPI2_IOC_STATE_COREDUMP) {
mpt3sas_base_coredump_info(ioc,
doorbell & MPI2_DOORBELL_DATA_MASK);
}
}
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment