Commit 72d66255, authored by Ohad Sharabi, committed by Oded Gabbay

habanalabs: modify multi-CS to wait on stream masters

During the integration, the multi-CS requirements were refined:
- The multi-CS call shall wait on "per-ASIC" predefined stream masters
  instead of a set of streams.
- Stream masters are a set of QIDs used by the upper SW layers (synapse)
  for completion (each must be an external/HW queue).
Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 1f6bdee7
...@@ -487,14 +487,15 @@ static void force_complete_multi_cs(struct hl_device *hdev) ...@@ -487,14 +487,15 @@ static void force_complete_multi_cs(struct hl_device *hdev)
* *
* @hdev: pointer to habanalabs device structure * @hdev: pointer to habanalabs device structure
* @cs: CS structure * @cs: CS structure
* * The function signals a waiting entity that has an overlapping stream masters
* The function signals waiting entity that its waiting stream has common * with the completed CS.
* stream with the completed CS.
* For example: * For example:
* - a completed CS worked on streams 0 and 1, multi CS completion * - a completed CS worked on stream master QID 4, multi CS completion
* is actively waiting on stream 3. don't send signal as no common stream * is actively waiting on stream master QIDs 3, 5. don't send signal as no
* - a completed CS worked on streams 0 and 1, multi CS completion * common stream master QID
* is actively waiting on streams 1 and 3. send signal as stream 1 is common * - a completed CS worked on stream master QID 4, multi CS completion
* is actively waiting on stream master QIDs 3, 4. send signal as stream
* master QID 4 is common
*/ */
static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
{ {
...@@ -518,10 +519,11 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs) ...@@ -518,10 +519,11 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
* complete if: * complete if:
* 1. still waiting for completion * 1. still waiting for completion
* 2. the completed CS has at least one overlapping stream * 2. the completed CS has at least one overlapping stream
* with the streams in the completion * master with the stream masters in the completion
*/ */
if (mcs_compl->used && if (mcs_compl->used &&
(fence->stream_map & mcs_compl->stream_map)) { (fence->stream_master_qid_map &
mcs_compl->stream_master_qid_map)) {
/* extract the timestamp only of first completed CS */ /* extract the timestamp only of first completed CS */
if (!mcs_compl->timestamp) if (!mcs_compl->timestamp)
mcs_compl->timestamp = mcs_compl->timestamp =
...@@ -1228,6 +1230,17 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs, ...@@ -1228,6 +1230,17 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
return 0; return 0;
} }
/*
 * get_stream_master_qid_mask - translate a queue ID into its stream master bit
 *
 * @hdev: pointer to habanalabs device structure
 * @qid: queue ID to look up
 *
 * Scans the first hdev->stream_master_qid_arr_size entries of
 * hdev->stream_master_qid_arr for @qid.
 *
 * @return BIT(index) of the first entry equal to @qid, 0 if @qid is not found
 */
static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
{
	u32 mask = 0;
	int idx = 0;

	while (idx < hdev->stream_master_qid_arr_size) {
		if (hdev->stream_master_qid_arr[idx] == qid) {
			/* the array position of the QID is its bit in the map */
			mask = BIT(idx);
			break;
		}
		idx++;
	}

	return mask;
}
static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
u32 num_chunks, u64 *cs_seq, u32 flags, u32 num_chunks, u64 *cs_seq, u32 flags,
u32 encaps_signals_handle, u32 timeout) u32 encaps_signals_handle, u32 timeout)
...@@ -1241,7 +1254,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, ...@@ -1241,7 +1254,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
struct hl_cs *cs; struct hl_cs *cs;
struct hl_cb *cb; struct hl_cb *cb;
u64 user_sequence; u64 user_sequence;
u8 stream_map = 0; u8 stream_master_qid_map = 0;
int rc, i; int rc, i;
cntr = &hdev->aggregated_cs_counters; cntr = &hdev->aggregated_cs_counters;
...@@ -1310,7 +1323,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, ...@@ -1310,7 +1323,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
* queues of this CS * queues of this CS
*/ */
if (hdev->supports_wait_for_multi_cs) if (hdev->supports_wait_for_multi_cs)
stream_map |= BIT((chunk->queue_index % 4)); stream_master_qid_map |=
get_stream_master_qid_mask(hdev,
chunk->queue_index);
} }
job = hl_cs_allocate_job(hdev, queue_type, job = hl_cs_allocate_job(hdev, queue_type,
...@@ -1378,7 +1393,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, ...@@ -1378,7 +1393,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
* fence object for multi-CS completion * fence object for multi-CS completion
*/ */
if (hdev->supports_wait_for_multi_cs) if (hdev->supports_wait_for_multi_cs)
cs->fence->stream_map = stream_map; cs->fence->stream_master_qid_map = stream_master_qid_map;
rc = hl_hw_queue_schedule_cs(cs); rc = hl_hw_queue_schedule_cs(cs);
if (rc) { if (rc) {
...@@ -2332,7 +2347,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data) ...@@ -2332,7 +2347,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
break; break;
} }
mcs_data->stream_map |= fence->stream_map; mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
if (status == CS_WAIT_STATUS_BUSY) if (status == CS_WAIT_STATUS_BUSY)
continue; continue;
...@@ -2394,7 +2409,8 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -2394,7 +2409,8 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
* hl_wait_multi_cs_completion_init - init completion structure * hl_wait_multi_cs_completion_init - init completion structure
* *
* @hdev: pointer to habanalabs device structure * @hdev: pointer to habanalabs device structure
* @stream_map: stream map, set bit indicates stream to wait on * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
* master QID to wait on
* *
* @return valid completion struct pointer on success, otherwise error pointer * @return valid completion struct pointer on success, otherwise error pointer
* *
...@@ -2404,7 +2420,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -2404,7 +2420,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
*/ */
static struct multi_cs_completion *hl_wait_multi_cs_completion_init( static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
struct hl_device *hdev, struct hl_device *hdev,
u8 stream_map) u8 stream_master_bitmap)
{ {
struct multi_cs_completion *mcs_compl; struct multi_cs_completion *mcs_compl;
int i; int i;
...@@ -2416,7 +2432,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init( ...@@ -2416,7 +2432,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
if (!mcs_compl->used) { if (!mcs_compl->used) {
mcs_compl->used = 1; mcs_compl->used = 1;
mcs_compl->timestamp = 0; mcs_compl->timestamp = 0;
mcs_compl->stream_map = stream_map; mcs_compl->stream_master_qid_map = stream_master_bitmap;
reinit_completion(&mcs_compl->completion); reinit_completion(&mcs_compl->completion);
spin_unlock(&mcs_compl->lock); spin_unlock(&mcs_compl->lock);
break; break;
...@@ -2464,7 +2480,7 @@ static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data) ...@@ -2464,7 +2480,7 @@ static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
long completion_rc; long completion_rc;
mcs_compl = hl_wait_multi_cs_completion_init(hdev, mcs_compl = hl_wait_multi_cs_completion_init(hdev,
mcs_data->stream_map); mcs_data->stream_master_qid_map);
if (IS_ERR(mcs_compl)) if (IS_ERR(mcs_compl))
return PTR_ERR(mcs_compl); return PTR_ERR(mcs_compl);
......
...@@ -592,18 +592,18 @@ struct asic_fixed_properties { ...@@ -592,18 +592,18 @@ struct asic_fixed_properties {
* @completion: fence is implemented using completion * @completion: fence is implemented using completion
* @refcount: refcount for this fence * @refcount: refcount for this fence
* @cs_sequence: sequence of the corresponding command submission * @cs_sequence: sequence of the corresponding command submission
* @stream_master_qid_map: streams masters QID bitmap to represent all streams
* masters QIDs that multi cs is waiting on
* @error: mark this fence with error * @error: mark this fence with error
* @timestamp: timestamp upon completion * @timestamp: timestamp upon completion
* @stream_map: streams bitmap to represent all streams that multi cs is
* waiting on
*/ */
struct hl_fence { struct hl_fence {
struct completion completion; struct completion completion;
struct kref refcount; struct kref refcount;
u64 cs_sequence; u64 cs_sequence;
u32 stream_master_qid_map;
int error; int error;
ktime_t timestamp; ktime_t timestamp;
u8 stream_map;
}; };
/** /**
...@@ -1160,6 +1160,7 @@ struct fw_load_mgr { ...@@ -1160,6 +1160,7 @@ struct fw_load_mgr {
* @state_dump_init: initialize constants required for state dump * @state_dump_init: initialize constants required for state dump
* @get_sob_addr: get SOB base address offset. * @get_sob_addr: get SOB base address offset.
* @set_pci_memory_regions: setting properties of PCI memory regions * @set_pci_memory_regions: setting properties of PCI memory regions
* @get_stream_master_qid_arr: get pointer to stream masters QID array
*/ */
struct hl_asic_funcs { struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev); int (*early_init)(struct hl_device *hdev);
...@@ -1289,6 +1290,7 @@ struct hl_asic_funcs { ...@@ -1289,6 +1290,7 @@ struct hl_asic_funcs {
void (*state_dump_init)(struct hl_device *hdev); void (*state_dump_init)(struct hl_device *hdev);
u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id); u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
void (*set_pci_memory_regions)(struct hl_device *hdev); void (*set_pci_memory_regions)(struct hl_device *hdev);
u32* (*get_stream_master_qid_arr)(void);
}; };
...@@ -2263,16 +2265,16 @@ struct hl_mmu_funcs { ...@@ -2263,16 +2265,16 @@ struct hl_mmu_funcs {
* @completion: completion of any of the CS in the list * @completion: completion of any of the CS in the list
* @lock: spinlock for the completion structure * @lock: spinlock for the completion structure
* @timestamp: timestamp for the multi-CS completion * @timestamp: timestamp for the multi-CS completion
* @used: 1 if in use, otherwise 0 * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting * is waiting
* @used: 1 if in use, otherwise 0
*/ */
struct multi_cs_completion { struct multi_cs_completion {
struct completion completion; struct completion completion;
spinlock_t lock; spinlock_t lock;
s64 timestamp; s64 timestamp;
u32 stream_master_qid_map;
u8 used; u8 used;
u8 stream_map;
}; };
/** /**
...@@ -2284,9 +2286,9 @@ struct multi_cs_completion { ...@@ -2284,9 +2286,9 @@ struct multi_cs_completion {
* @timestamp: timestamp of first completed CS * @timestamp: timestamp of first completed CS
* @wait_status: wait for CS status * @wait_status: wait for CS status
* @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
* @stream_master_qid_map: bitmap of all stream master QIDs on which the
* multi-CS is waiting
* @arr_len: fence_arr and seq_arr array length * @arr_len: fence_arr and seq_arr array length
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting
* @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
* @update_ts: update timestamp. 1- update the timestamp, otherwise 0. * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
*/ */
...@@ -2298,8 +2300,8 @@ struct multi_cs_data { ...@@ -2298,8 +2300,8 @@ struct multi_cs_data {
s64 timestamp; s64 timestamp;
long wait_status; long wait_status;
u32 completion_bitmap; u32 completion_bitmap;
u32 stream_master_qid_map;
u8 arr_len; u8 arr_len;
u8 stream_map;
u8 gone_cs; u8 gone_cs;
u8 update_ts; u8 update_ts;
}; };
...@@ -2520,6 +2522,7 @@ struct hl_device { ...@@ -2520,6 +2522,7 @@ struct hl_device {
struct multi_cs_completion multi_cs_completion[ struct multi_cs_completion multi_cs_completion[
MULTI_CS_MAX_USER_CTX]; MULTI_CS_MAX_USER_CTX];
u32 *stream_master_qid_arr;
atomic64_t dram_used_mem; atomic64_t dram_used_mem;
u64 timeout_jiffies; u64 timeout_jiffies;
u64 max_power; u64 max_power;
...@@ -2570,6 +2573,7 @@ struct hl_device { ...@@ -2570,6 +2573,7 @@ struct hl_device {
u8 skip_reset_on_timeout; u8 skip_reset_on_timeout;
u8 device_cpu_is_halted; u8 device_cpu_is_halted;
u8 supports_wait_for_multi_cs; u8 supports_wait_for_multi_cs;
u8 stream_master_qid_arr_size;
/* Parameters for bring-up */ /* Parameters for bring-up */
u64 nic_ports_mask; u64 nic_ports_mask;
......
...@@ -721,7 +721,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) ...@@ -721,7 +721,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
/* update stream map of the first CS */ /* update stream map of the first CS */
if (hdev->supports_wait_for_multi_cs) if (hdev->supports_wait_for_multi_cs)
staged_cs->fence->stream_map |= cs->fence->stream_map; staged_cs->fence->stream_master_qid_map |=
cs->fence->stream_master_qid_map;
} }
list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list); list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
......
...@@ -110,6 +110,17 @@ ...@@ -110,6 +110,17 @@
#define MONITOR_SOB_STRING_SIZE 256 #define MONITOR_SOB_STRING_SIZE 256
/*
 * Gaudi stream master QIDs, handed out by gaudi_get_stream_master_qid_arr().
 * Holds GAUDI_STREAM_MASTER_ARR_SIZE entries: DMA_0 queues 0-3 followed by
 * DMA_1 queues 0-3.
 */
static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = {
	[0] = GAUDI_QUEUE_ID_DMA_0_0,
	[1] = GAUDI_QUEUE_ID_DMA_0_1,
	[2] = GAUDI_QUEUE_ID_DMA_0_2,
	[3] = GAUDI_QUEUE_ID_DMA_0_3,
	[4] = GAUDI_QUEUE_ID_DMA_1_0,
	[5] = GAUDI_QUEUE_ID_DMA_1_1,
	[6] = GAUDI_QUEUE_ID_DMA_1_2,
	[7] = GAUDI_QUEUE_ID_DMA_1_3
};
static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = { static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
"gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3", "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
"gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3", "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
...@@ -1870,6 +1881,9 @@ static int gaudi_sw_init(struct hl_device *hdev) ...@@ -1870,6 +1881,9 @@ static int gaudi_sw_init(struct hl_device *hdev)
hdev->supports_wait_for_multi_cs = true; hdev->supports_wait_for_multi_cs = true;
hdev->asic_funcs->set_pci_memory_regions(hdev); hdev->asic_funcs->set_pci_memory_regions(hdev);
hdev->stream_master_qid_arr =
hdev->asic_funcs->get_stream_master_qid_arr();
hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;
return 0; return 0;
...@@ -9352,6 +9366,11 @@ static void gaudi_state_dump_init(struct hl_device *hdev) ...@@ -9352,6 +9366,11 @@ static void gaudi_state_dump_init(struct hl_device *hdev)
sds->funcs = gaudi_state_dump_funcs; sds->funcs = gaudi_state_dump_funcs;
} }
/*
 * gaudi_get_stream_master_qid_arr - get the Gaudi stream master QID array
 *
 * @return pointer to the first element of gaudi_stream_master
 */
static u32 *gaudi_get_stream_master_qid_arr(void)
{
	return &gaudi_stream_master[0];
}
static const struct hl_asic_funcs gaudi_funcs = { static const struct hl_asic_funcs gaudi_funcs = {
.early_init = gaudi_early_init, .early_init = gaudi_early_init,
.early_fini = gaudi_early_fini, .early_fini = gaudi_early_fini,
...@@ -9440,7 +9459,8 @@ static const struct hl_asic_funcs gaudi_funcs = { ...@@ -9440,7 +9459,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
.init_cpu_scrambler_dram = gaudi_init_scrambler_hbm, .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
.state_dump_init = gaudi_state_dump_init, .state_dump_init = gaudi_state_dump_init,
.get_sob_addr = gaudi_get_sob_addr, .get_sob_addr = gaudi_get_sob_addr,
.set_pci_memory_regions = gaudi_set_pci_memory_regions .set_pci_memory_regions = gaudi_set_pci_memory_regions,
.get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
}; };
/** /**
......
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
#define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \ #define NUMBER_OF_INTERRUPTS (NUMBER_OF_CMPLT_QUEUES + \
NUMBER_OF_CPU_HW_QUEUES) NUMBER_OF_CPU_HW_QUEUES)
#define GAUDI_STREAM_MASTER_ARR_SIZE 8
#if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES) #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
#error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES" #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
#endif #endif
......
...@@ -5588,6 +5588,11 @@ static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id) ...@@ -5588,6 +5588,11 @@ static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id)
return 0; return 0;
} }
/*
 * goya_get_stream_master_qid_arr - get the Goya stream master QID array
 *
 * Goya provides no stream master QID array.
 *
 * @return always NULL
 */
static u32 *goya_get_stream_master_qid_arr(void)
{
	u32 *no_stream_masters = NULL;

	return no_stream_masters;
}
static const struct hl_asic_funcs goya_funcs = { static const struct hl_asic_funcs goya_funcs = {
.early_init = goya_early_init, .early_init = goya_early_init,
.early_fini = goya_early_fini, .early_fini = goya_early_fini,
...@@ -5677,6 +5682,7 @@ static const struct hl_asic_funcs goya_funcs = { ...@@ -5677,6 +5682,7 @@ static const struct hl_asic_funcs goya_funcs = {
.state_dump_init = goya_state_dump_init, .state_dump_init = goya_state_dump_init,
.get_sob_addr = &goya_get_sob_addr, .get_sob_addr = &goya_get_sob_addr,
.set_pci_memory_regions = goya_set_pci_memory_regions, .set_pci_memory_regions = goya_set_pci_memory_regions,
.get_stream_master_qid_arr = goya_get_stream_master_qid_arr,
}; };
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment