Commit 66446820 authored by Oded Gabbay

habanalabs: GAUDI does not support soft-reset

GAUDI does not support soft-reset as it leaves the NIC ports in an awkward
state, where their QMANs were reset but the NIC itself is still working.

In addition, there is not much sense in doing soft-reset when training is
done on multiple GAUDIs.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
parent d7985079
...@@ -801,6 +801,7 @@ static void device_hard_reset_pending(struct work_struct *work) ...@@ -801,6 +801,7 @@ static void device_hard_reset_pending(struct work_struct *work)
* @hdev: pointer to habanalabs device structure * @hdev: pointer to habanalabs device structure
* @hard_reset: should we do hard reset to all engines or just reset the * @hard_reset: should we do hard reset to all engines or just reset the
* compute/dma engines * compute/dma engines
* @from_hard_reset_thread: is the caller the hard-reset thread
* *
* Block future CS and wait for pending CS to be enqueued * Block future CS and wait for pending CS to be enqueued
* Call ASIC H/W fini * Call ASIC H/W fini
...@@ -823,6 +824,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, ...@@ -823,6 +824,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
return 0; return 0;
} }
if ((!hard_reset) && (!hdev->supports_soft_reset)) {
dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
hard_reset = true;
}
/* /*
* Prevent concurrency in this function - only one reset should be * Prevent concurrency in this function - only one reset should be
* done at any given time. Only need to perform this if we didn't * done at any given time. Only need to perform this if we didn't
......
...@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK) u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
>> EQ_CTL_EVENT_TYPE_SHIFT); >> EQ_CTL_EVENT_TYPE_SHIFT);
u8 cause; u8 cause;
bool soft_reset_required; bool reset_required;
gaudi->events_stat[event_type]++; gaudi->events_stat[event_type]++;
gaudi->events_stat_aggregate[event_type]++; gaudi->events_stat_aggregate[event_type]++;
...@@ -5840,16 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -5840,16 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_DEC: case GAUDI_EVENT_TPC6_DEC:
case GAUDI_EVENT_TPC7_DEC: case GAUDI_EVENT_TPC7_DEC:
gaudi_print_irq_info(hdev, event_type, true); gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev, reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_dec_event_to_tpc_id(event_type), tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error"); "AXI_SLV_DEC_Error");
if (soft_reset_required) { if (reset_required) {
dev_err_ratelimited(hdev->dev, dev_err(hdev->dev, "hard reset required due to %s\n",
"soft reset required due to %s\n", gaudi_irq_map_table[event_type].name);
gaudi_irq_map_table[event_type].name);
hl_device_reset(hdev, false, false); if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
} }
hl_fw_unmask_irq(hdev, event_type);
break; break;
case GAUDI_EVENT_TPC0_KRN_ERR: case GAUDI_EVENT_TPC0_KRN_ERR:
...@@ -5861,16 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -5861,16 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_TPC6_KRN_ERR: case GAUDI_EVENT_TPC6_KRN_ERR:
case GAUDI_EVENT_TPC7_KRN_ERR: case GAUDI_EVENT_TPC7_KRN_ERR:
gaudi_print_irq_info(hdev, event_type, true); gaudi_print_irq_info(hdev, event_type, true);
soft_reset_required = gaudi_tpc_read_interrupts(hdev, reset_required = gaudi_tpc_read_interrupts(hdev,
tpc_krn_event_to_tpc_id(event_type), tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR"); "KRN_ERR");
if (soft_reset_required) { if (reset_required) {
dev_err_ratelimited(hdev->dev, dev_err(hdev->dev, "hard reset required due to %s\n",
"soft reset required due to %s\n", gaudi_irq_map_table[event_type].name);
gaudi_irq_map_table[event_type].name);
hl_device_reset(hdev, false, false); if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, true, false);
} else {
hl_fw_unmask_irq(hdev, event_type);
} }
hl_fw_unmask_irq(hdev, event_type);
break; break;
case GAUDI_EVENT_PCIE_CORE_SERR: case GAUDI_EVENT_PCIE_CORE_SERR:
...@@ -5921,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -5921,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_RAZWI_OR_ADC_SW: case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true); gaudi_print_irq_info(hdev, event_type, true);
hl_device_reset(hdev, false, false); if (hdev->hard_reset_on_fw_events)
hl_fw_unmask_irq(hdev, event_type); hl_device_reset(hdev, true, false);
break; break;
case GAUDI_EVENT_TPC0_BMON_SPMU: case GAUDI_EVENT_TPC0_BMON_SPMU:
......
...@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev) ...@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
spin_lock_init(&goya->hw_queues_lock); spin_lock_init(&goya->hw_queues_lock);
hdev->supports_coresight = true; hdev->supports_coresight = true;
hdev->supports_soft_reset = true;
return 0; return 0;
......
...@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts { ...@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
* @stop_on_err: true if engines should stop on error. * @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported. * @supports_sync_stream: is sync stream supported.
* @supports_coresight: is CoreSight supported. * @supports_coresight: is CoreSight supported.
* @supports_soft_reset: is soft reset supported.
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -1522,6 +1523,7 @@ struct hl_device { ...@@ -1522,6 +1523,7 @@ struct hl_device {
u8 stop_on_err; u8 stop_on_err;
u8 supports_sync_stream; u8 supports_sync_stream;
u8 supports_coresight; u8 supports_coresight;
u8 supports_soft_reset;
/* Parameters for bring-up */ /* Parameters for bring-up */
u8 mmu_enable; u8 mmu_enable;
......
...@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev, ...@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
goto out; goto out;
} }
if (!hdev->supports_soft_reset) {
dev_err(hdev->dev, "Device does not support soft-reset\n");
goto out;
}
dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n"); dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
hl_device_reset(hdev, false, false); hl_device_reset(hdev, false, false);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment