Commit edb07cb6 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay

habanalabs: read device boot errors after cpucp is up

Boot cpu can report errors in various boot stages.
Current implementaion does not take into consideration errors
reported in late stages, hence we will check for errors at the most
late stage when fetching cpucp information.
Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent 6769cea8
...@@ -279,8 +279,68 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) ...@@ -279,8 +279,68 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
return rc; return rc;
} }
static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
u32 cpu_security_boot_status_reg)
{
u32 err_val, security_val;
/* Some of the firmware status codes are deprecated in newer f/w
* versions. In those versions, the errors are reported
* in different registers. Therefore, we need to check those
* registers and print the exact errors. Moreover, there
* may be multiple errors, so we need to report on each error
* separately. Some of the error codes might indicate a state
* that is not an error per-se, but it is an error in production
* environment
*/
err_val = RREG32(boot_err0_reg);
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return 0;
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
dev_err(hdev->dev,
"Device boot error - DRAM initialization failed\n");
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
dev_err(hdev->dev,
"Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n");
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
dev_warn(hdev->dev,
"Device boot error - Skipped waiting for BMC\n");
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
dev_err(hdev->dev,
"Device boot error - Serdes data from BMC not available\n");
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
dev_err(hdev->dev,
"Device boot error - NIC F/W initialization failed\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
dev_warn(hdev->dev,
"Device boot warning - security not ready\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
dev_err(hdev->dev, "Device boot error - security failure\n");
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
dev_err(hdev->dev, "Device boot error - PLL failure\n");
security_val = RREG32(cpu_security_boot_status_reg);
if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device security status %#x\n",
security_val);
if (err_val & ~CPU_BOOT_ERR0_ENABLED)
return -EIO;
return 0;
}
int hl_fw_cpucp_info_get(struct hl_device *hdev, int hl_fw_cpucp_info_get(struct hl_device *hdev,
u32 cpu_security_boot_status_reg) u32 cpu_security_boot_status_reg,
u32 boot_err0_reg)
{ {
struct asic_fixed_properties *prop = &hdev->asic_prop; struct asic_fixed_properties *prop = &hdev->asic_prop;
struct cpucp_packet pkt = {}; struct cpucp_packet pkt = {};
...@@ -314,6 +374,12 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev, ...@@ -314,6 +374,12 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev,
goto out; goto out;
} }
rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
if (rc) {
dev_err(hdev->dev, "Errors in device boot\n");
goto out;
}
memcpy(&prop->cpucp_info, cpucp_info_cpu_addr, memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
sizeof(prop->cpucp_info)); sizeof(prop->cpucp_info));
...@@ -483,58 +549,6 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index, ...@@ -483,58 +549,6 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index,
return rc; return rc;
} }
static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
u32 cpu_security_boot_status_reg)
{
u32 err_val, security_val;
/* Some of the firmware status codes are deprecated in newer f/w
* versions. In those versions, the errors are reported
* in different registers. Therefore, we need to check those
* registers and print the exact errors. Moreover, there
* may be multiple errors, so we need to report on each error
* separately. Some of the error codes might indicate a state
* that is not an error per-se, but it is an error in production
* environment
*/
err_val = RREG32(boot_err0_reg);
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return;
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
dev_err(hdev->dev,
"Device boot error - DRAM initialization failed\n");
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
dev_err(hdev->dev,
"Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n");
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED)
dev_warn(hdev->dev,
"Device boot error - Skipped waiting for BMC\n");
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
dev_err(hdev->dev,
"Device boot error - Serdes data from BMC not available\n");
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
dev_err(hdev->dev,
"Device boot error - NIC F/W initialization failed\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
dev_warn(hdev->dev,
"Device boot warning - security not ready\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
dev_err(hdev->dev, "Device boot error - security failure\n");
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
security_val = RREG32(cpu_security_boot_status_reg);
if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device security status %#x\n",
security_val);
}
static void detect_cpu_boot_status(struct hl_device *hdev, u32 status) static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
{ {
/* Some of the status codes below are deprecated in newer f/w /* Some of the status codes below are deprecated in newer f/w
...@@ -833,6 +847,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, ...@@ -833,6 +847,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
goto out; goto out;
} }
rc = fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
if (rc)
return rc;
/* Clear reset status since we need to read again from app */ /* Clear reset status since we need to read again from app */
prop->hard_reset_done_by_fw = false; prop->hard_reset_done_by_fw = false;
...@@ -855,6 +873,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, ...@@ -855,6 +873,8 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
dev_info(hdev->dev, "Successfully loaded firmware to device\n"); dev_info(hdev->dev, "Successfully loaded firmware to device\n");
return 0;
out: out:
fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg); fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
......
...@@ -2209,7 +2209,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, ...@@ -2209,7 +2209,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr); void *vaddr);
int hl_fw_send_heartbeat(struct hl_device *hdev); int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_cpucp_info_get(struct hl_device *hdev, int hl_fw_cpucp_info_get(struct hl_device *hdev,
u32 cpu_security_boot_status_reg); u32 cpu_security_boot_status_reg,
u32 boot_err0_reg);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters); struct hl_info_pci_counters *counters);
......
...@@ -7451,7 +7451,7 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev) ...@@ -7451,7 +7451,7 @@ static int gaudi_cpucp_info_get(struct hl_device *hdev)
if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q)) if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
return 0; return 0;
rc = hl_fw_cpucp_info_get(hdev, mmCPU_BOOT_DEV_STS0); rc = hl_fw_cpucp_info_get(hdev, mmCPU_BOOT_DEV_STS0, mmCPU_BOOT_ERR0);
if (rc) if (rc)
return rc; return rc;
......
...@@ -5150,7 +5150,7 @@ int goya_cpucp_info_get(struct hl_device *hdev) ...@@ -5150,7 +5150,7 @@ int goya_cpucp_info_get(struct hl_device *hdev)
if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q)) if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q))
return 0; return 0;
rc = hl_fw_cpucp_info_get(hdev, mmCPU_BOOT_DEV_STS0); rc = hl_fw_cpucp_info_get(hdev, mmCPU_BOOT_DEV_STS0, mmCPU_BOOT_ERR0);
if (rc) if (rc)
return rc; return rc;
......
...@@ -69,6 +69,8 @@ ...@@ -69,6 +69,8 @@
* image has failed to match expected * image has failed to match expected
* checksum. Trying to program image again * checksum. Trying to program image again
* might solve this. * might solve this.
* CPU_BOOT_ERR0_PLL_FAIL PLL settings failed, meaning that one
* of the PLLs remained in REF_CLK
* *
* CPU_BOOT_ERR0_ENABLED Error registers enabled. * CPU_BOOT_ERR0_ENABLED Error registers enabled.
* This is a main indication that the * This is a main indication that the
...@@ -88,6 +90,7 @@ ...@@ -88,6 +90,7 @@
#define CPU_BOOT_ERR0_EFUSE_FAIL (1 << 9) #define CPU_BOOT_ERR0_EFUSE_FAIL (1 << 9)
#define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL (1 << 10) #define CPU_BOOT_ERR0_PRI_IMG_VER_FAIL (1 << 10)
#define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL (1 << 11) #define CPU_BOOT_ERR0_SEC_IMG_VER_FAIL (1 << 11)
#define CPU_BOOT_ERR0_PLL_FAIL (1 << 12)
#define CPU_BOOT_ERR0_ENABLED (1 << 31) #define CPU_BOOT_ERR0_ENABLED (1 << 31)
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment