Commit b575a767 authored by Oded Gabbay

habanalabs: print f/w boot unknown error

We need to print a message to the kernel log in case we encounter
an unknown error in the f/w boot to help the user understand what
happened.

In addition, we shouldn't print an unknown error in case of known errors.

Moreover, in case of warnings/info, we shouldn't return -EIO, which would
fail the initialization and mark the device as disabled.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 669b0188
...@@ -293,6 +293,7 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg, ...@@ -293,6 +293,7 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
u32 cpu_security_boot_status_reg) u32 cpu_security_boot_status_reg)
{ {
u32 err_val, security_val; u32 err_val, security_val;
bool err_exists = false;
/* Some of the firmware status codes are deprecated in newer f/w /* Some of the firmware status codes are deprecated in newer f/w
* versions. In those versions, the errors are reported * versions. In those versions, the errors are reported
...@@ -307,51 +308,102 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg, ...@@ -307,51 +308,102 @@ static int fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
if (!(err_val & CPU_BOOT_ERR0_ENABLED)) if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return 0; return 0;
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Device boot error - DRAM initialization failed\n"); "Device boot error - DRAM initialization failed\n");
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n"); dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Device boot error - Thermal Sensor initialization failed\n"); "Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
dev_warn(hdev->dev, dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n"); "Device boot warning - Skipped DRAM initialization\n");
/* This is a warning so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
}
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) { if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
if (hdev->bmc_enable) if (hdev->bmc_enable) {
dev_warn(hdev->dev, dev_err(hdev->dev,
"Device boot error - Skipped waiting for BMC\n"); "Device boot error - Skipped waiting for BMC\n");
else err_exists = true;
} else {
dev_info(hdev->dev,
"Device boot message - Skipped waiting for BMC\n");
/* This is an info so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED; err_val &= ~CPU_BOOT_ERR0_BMC_WAIT_SKIPPED;
}
} }
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Device boot error - Serdes data from BMC not available\n"); "Device boot error - Serdes data from BMC not available\n");
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Device boot error - NIC F/W initialization failed\n"); "Device boot error - NIC F/W initialization failed\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
dev_warn(hdev->dev, dev_warn(hdev->dev,
"Device boot warning - security not ready\n"); "Device boot warning - security not ready\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) /* This is a warning so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_SECURITY_NOT_RDY;
}
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
dev_err(hdev->dev, "Device boot error - security failure\n"); dev_err(hdev->dev, "Device boot error - security failure\n");
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
dev_err(hdev->dev, "Device boot error - eFuse failure\n"); dev_err(hdev->dev, "Device boot error - eFuse failure\n");
if (err_val & CPU_BOOT_ERR0_PLL_FAIL) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
dev_err(hdev->dev, "Device boot error - PLL failure\n"); dev_err(hdev->dev, "Device boot error - PLL failure\n");
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Device boot error - device unusable failure\n"); "Device boot error - device unusable\n");
err_exists = true;
}
security_val = RREG32(cpu_security_boot_status_reg); security_val = RREG32(cpu_security_boot_status_reg);
if (security_val & CPU_BOOT_DEV_STS0_ENABLED) if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device security status %#x\n", dev_dbg(hdev->dev, "Device security status %#x\n",
security_val); security_val);
if (err_val & ~CPU_BOOT_ERR0_ENABLED) if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
dev_err(hdev->dev,
"Device boot error - unknown error 0x%08x\n",
err_val);
err_exists = true;
}
if (err_exists)
return -EIO; return -EIO;
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment