Commit cd6b0cea authored by Oded Gabbay

habanalabs/gaudi: increase default cs timeout to 10 minutes

In order to improve scalability and reduce host overhead, it is better
to increase the default TDR timeout of Gaudi1 from 30 seconds to
10 minutes.

This will allow DL frameworks (e.g. PyTorch, TensorFlow) to remove
the host sync they currently use and improve overall performance on
scale-out training.

Note that one can always set the timeout to a custom value via
a kernel module parameter given during driver load.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 913bd417
...@@ -27,7 +27,10 @@ static struct class *hl_class; ...@@ -27,7 +27,10 @@ static struct class *hl_class;
static DEFINE_IDR(hl_devs_idr); static DEFINE_IDR(hl_devs_idr);
static DEFINE_MUTEX(hl_devs_idr_lock); static DEFINE_MUTEX(hl_devs_idr_lock);
static int timeout_locked = 30; #define HL_DEFAULT_TIMEOUT_LOCKED 30 /* 30 seconds */
#define GAUDI_DEFAULT_TIMEOUT_LOCKED 600 /* 10 minutes */
static int timeout_locked = HL_DEFAULT_TIMEOUT_LOCKED;
static int reset_on_lockup = 1; static int reset_on_lockup = 1;
static int memory_scrub; static int memory_scrub;
static ulong boot_error_status_mask = ULONG_MAX; static ulong boot_error_status_mask = ULONG_MAX;
...@@ -314,12 +317,22 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev) ...@@ -314,12 +317,22 @@ static void copy_kernel_module_params_to_device(struct hl_device *hdev)
hdev->boot_error_status_mask = boot_error_status_mask; hdev->boot_error_status_mask = boot_error_status_mask;
} }
static void fixup_device_params_per_asic(struct hl_device *hdev) static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
{ {
switch (hdev->asic_type) { switch (hdev->asic_type) {
case ASIC_GOYA:
case ASIC_GAUDI: case ASIC_GAUDI:
case ASIC_GAUDI_SEC: case ASIC_GAUDI_SEC:
/* If user didn't request a different timeout than the default one, we have
* a different default timeout for Gaudi
*/
if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED *
MSEC_PER_SEC);
hdev->reset_upon_device_release = 0;
break;
case ASIC_GOYA:
hdev->reset_upon_device_release = 0; hdev->reset_upon_device_release = 0;
break; break;
...@@ -339,7 +352,7 @@ static int fixup_device_params(struct hl_device *hdev) ...@@ -339,7 +352,7 @@ static int fixup_device_params(struct hl_device *hdev)
hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC; hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
if (tmp_timeout) if (tmp_timeout)
hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * 1000); hdev->timeout_jiffies = msecs_to_jiffies(tmp_timeout * MSEC_PER_SEC);
else else
hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT; hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
...@@ -360,7 +373,7 @@ static int fixup_device_params(struct hl_device *hdev) ...@@ -360,7 +373,7 @@ static int fixup_device_params(struct hl_device *hdev)
if (!hdev->cpu_queues_enable) if (!hdev->cpu_queues_enable)
hdev->heartbeat = 0; hdev->heartbeat = 0;
fixup_device_params_per_asic(hdev); fixup_device_params_per_asic(hdev, tmp_timeout);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment