Commit 71731090 authored by Omer Shpigelman, committed by Oded Gabbay

habanalabs: add "in device creation" status

On init, the disabled state is cleared right before hw_init, which
causes the device to report the "Operational" state before device
initialization is finished. Although the char device is not yet exposed
to the user at this stage, the sysfs entries are exposed.

This can cause errors in monitoring applications that use the sysfs
entries.

In order to avoid this, a new state "in device creation" is introduced
to be reported when the device is not disabled but is still in init
flow.
Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent e1b61f8e
...@@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) ...@@ -23,6 +23,8 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
status = HL_DEVICE_STATUS_NEEDS_RESET; status = HL_DEVICE_STATUS_NEEDS_RESET;
else if (hdev->disabled) else if (hdev->disabled)
status = HL_DEVICE_STATUS_MALFUNCTION; status = HL_DEVICE_STATUS_MALFUNCTION;
else if (!hdev->init_done)
status = HL_DEVICE_STATUS_IN_DEVICE_CREATION;
else else
status = HL_DEVICE_STATUS_OPERATIONAL; status = HL_DEVICE_STATUS_OPERATIONAL;
...@@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev, ...@@ -44,6 +46,7 @@ bool hl_device_operational(struct hl_device *hdev,
case HL_DEVICE_STATUS_NEEDS_RESET: case HL_DEVICE_STATUS_NEEDS_RESET:
return false; return false;
case HL_DEVICE_STATUS_OPERATIONAL: case HL_DEVICE_STATUS_OPERATIONAL:
case HL_DEVICE_STATUS_IN_DEVICE_CREATION:
default: default:
return true; return true;
} }
......
...@@ -1990,7 +1990,7 @@ struct hl_state_dump_specs { ...@@ -1990,7 +1990,7 @@ struct hl_state_dump_specs {
#define HL_STR_MAX 32 #define HL_STR_MAX 32
#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1) #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1)
/* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
* x16 cards. In extreme cases, there are hosts that can accommodate 16 cards. * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
......
...@@ -317,12 +317,16 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev, ...@@ -317,12 +317,16 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
hdev->asic_prop.fw_security_enabled = false; hdev->asic_prop.fw_security_enabled = false;
/* Assign status description string */ /* Assign status description string */
strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
"disabled", HL_STR_MAX); "operational", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
"in reset", HL_STR_MAX); "in reset", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
"disabled", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
"needs reset", HL_STR_MAX); "needs reset", HL_STR_MAX);
strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
"in device creation", HL_STR_MAX);
hdev->major = hl_major; hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup; hdev->reset_on_lockup = reset_on_lockup;
......
...@@ -9,8 +9,7 @@ ...@@ -9,8 +9,7 @@
#include <linux/pci.h> #include <linux/pci.h>
long hl_get_frequency(struct hl_device *hdev, u32 pll_index, long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
bool curr)
{ {
struct cpucp_packet pkt; struct cpucp_packet pkt;
u32 used_pll_idx; u32 used_pll_idx;
...@@ -44,8 +43,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, ...@@ -44,8 +43,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index,
return (long) result; return (long) result;
} }
void hl_set_frequency(struct hl_device *hdev, u32 pll_index, void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
u64 freq)
{ {
struct cpucp_packet pkt; struct cpucp_packet pkt;
u32 used_pll_idx; u32 used_pll_idx;
...@@ -285,16 +283,12 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr, ...@@ -285,16 +283,12 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
char *buf) char *buf)
{ {
struct hl_device *hdev = dev_get_drvdata(dev); struct hl_device *hdev = dev_get_drvdata(dev);
char *str; char str[HL_STR_MAX];
if (atomic_read(&hdev->in_reset)) strscpy(str, hdev->status[hl_device_status(hdev)], HL_STR_MAX);
str = "In reset";
else if (hdev->disabled) /* use uppercase for backward compatibility */
str = "Malfunction"; str[0] = 'A' + (str[0] - 'a');
else if (hdev->needs_reset)
str = "Needs Reset";
else
str = "Operational";
return sprintf(buf, "%s\n", str); return sprintf(buf, "%s\n", str);
} }
......
...@@ -276,7 +276,9 @@ enum hl_device_status { ...@@ -276,7 +276,9 @@ enum hl_device_status {
HL_DEVICE_STATUS_OPERATIONAL, HL_DEVICE_STATUS_OPERATIONAL,
HL_DEVICE_STATUS_IN_RESET, HL_DEVICE_STATUS_IN_RESET,
HL_DEVICE_STATUS_MALFUNCTION, HL_DEVICE_STATUS_MALFUNCTION,
HL_DEVICE_STATUS_NEEDS_RESET HL_DEVICE_STATUS_NEEDS_RESET,
HL_DEVICE_STATUS_IN_DEVICE_CREATION,
HL_DEVICE_STATUS_LAST = HL_DEVICE_STATUS_IN_DEVICE_CREATION
}; };
enum hl_server_type { enum hl_server_type {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment