Commit 215f0c17 authored by Ohad Sharabi's avatar Ohad Sharabi Committed by Oded Gabbay

habanalabs: add wait-for-multi-CS uAPI

When user sends multiple CSs, waiting for each CS is not efficient
as it involves many user-kernel context switches.

In order to address this issue we add support to "wait on multiple CSs"
using a new uAPI which can wait on maximum of 32 CSs. The new uAPI is
defined using a new flag - WAIT_FOR_MULTI_CS - in the wait_for_cs IOCTL.

The input parameters for this uAPI will be:
@seq: user pointer to an array of up to 32 CS's sequence numbers.
@seq_array_len: length of sequence array.
@timeout_us: timeout for waiting for any CS.

The output paramateres for this API will be:
@status: multi CS ioctl completion status (dedicated status was added as
         well).
@flags: bitmap of output flags of the CS.
@cs_completion_map: bitmap for multi CS, if CS sequence that was placed
                    in index N in input seq array has completed- the N-th
		    bit in cs_completion_map will be 1, otherwise it will
		    be 0.
@timestamp_nsec: timestamp of the first completed CS
Signed-off-by: default avatarOhad Sharabi <osharabi@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent c457d5ab
......@@ -229,7 +229,17 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release);
}
/* this function shall be called with cs_lock locked */
/*
* hl_ctx_get_fence_locked - get CS fence under CS lock
*
* @ctx: pointer to the context structure.
* @seq: CS sequences number
*
* @return valid fence pointer on success, NULL if fence is gone, otherwise
* error pointer.
*
* NOTE: this function shall be called with cs_lock locked
*/
static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq)
{
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
......@@ -259,6 +269,16 @@ struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return fence;
}
/*
* hl_ctx_get_fences - get multiple CS fences under the same CS lock
*
* @ctx: pointer to the context structure.
* @seq_arr: array of CS sequences to wait for
* @fence: fence array to store the CS fences
* @arr_len: length of seq_arr and fence_arr
*
* @return 0 on success, otherwise non 0 error code
*/
int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
struct hl_fence **fence, u32 arr_len)
{
......
......@@ -1297,6 +1297,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
if (rc)
goto user_interrupts_fini;
/* initialize completion structure for multi CS wait */
hl_multi_cs_completion_init(hdev);
/*
* Initialize the H/W queues. Must be done before hw_init, because
* there the addresses of the kernel queue are being written to the
......
......@@ -585,7 +585,8 @@ struct asic_fixed_properties {
* @cs_sequence: sequence of the corresponding command submission
* @error: mark this fence with error
* @timestamp: timestamp upon completion
*
* @stream_map: streams bitmap to represent all streams that multi cs is
* waiting on
*/
struct hl_fence {
struct completion completion;
......@@ -593,6 +594,7 @@ struct hl_fence {
u64 cs_sequence;
int error;
ktime_t timestamp;
u8 stream_map;
};
/**
......@@ -2234,6 +2236,58 @@ struct hl_mmu_funcs {
u64 virt_addr, struct hl_mmu_hop_info *hops);
};
/**
* number of user contexts allowed to call wait_for_multi_cs ioctl in
* parallel
*/
#define MULTI_CS_MAX_USER_CTX 2
/**
* struct multi_cs_completion - multi CS wait completion.
* @completion: completion of any of the CS in the list
* @lock: spinlock for the completion structure
* @timestamp: timestamp for the multi-CS completion
* @used: 1 if in use, otherwise 0
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting
*/
struct multi_cs_completion {
struct completion completion;
spinlock_t lock;
s64 timestamp;
u8 used;
u8 stream_map;
};
/**
* struct multi_cs_data - internal data for multi CS call
* @ctx: pointer to the context structure
* @fence_arr: array of fences of all CSs
* @seq_arr: array of CS sequence numbers
* @timeout_us: timeout in usec for waiting for CS to complete
* @timestamp: timestamp of first completed CS
* @wait_status: wait for CS status
* @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
* @arr_len: fence_arr and seq_arr array length
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting
* @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
* @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
*/
struct multi_cs_data {
struct hl_ctx *ctx;
struct hl_fence **fence_arr;
u64 *seq_arr;
s64 timeout_us;
s64 timestamp;
long wait_status;
u32 completion_bitmap;
u8 arr_len;
u8 stream_map;
u8 gone_cs;
u8 update_ts;
};
/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
......@@ -2300,6 +2354,7 @@ struct hl_mmu_funcs {
* @fw_loader: FW loader manager.
* @pci_mem_region: array of memory regions in the PCI
* @state_dump_specs: constants and dictionaries needed to dump system state.
* @multi_cs_completion: array of multi-CS completion.
* @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This
......@@ -2376,6 +2431,7 @@ struct hl_mmu_funcs {
* halted. We can't halt it again because the COMMS
* protocol will throw an error. Relevant only for
* cases where Linux was not loaded to device CPU
* @supports_wait_for_multi_cs: true if wait for multi CS is supported
*/
struct hl_device {
struct pci_dev *pdev;
......@@ -2446,6 +2502,9 @@ struct hl_device {
struct hl_state_dump_specs state_dump_specs;
struct multi_cs_completion multi_cs_completion[
MULTI_CS_MAX_USER_CTX];
atomic64_t dram_used_mem;
u64 timeout_jiffies;
u64 max_power;
......@@ -2495,6 +2554,7 @@ struct hl_device {
u8 curr_reset_cause;
u8 skip_reset_on_timeout;
u8 device_cpu_is_halted;
u8 supports_wait_for_multi_cs;
/* Parameters for bring-up */
u64 nic_ports_mask;
......@@ -2701,6 +2761,7 @@ bool cs_needs_completion(struct hl_cs *cs);
bool cs_needs_timeout(struct hl_cs *cs);
bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
void hl_multi_cs_completion_init(struct hl_device *hdev);
void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
......
......@@ -603,6 +603,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
}
list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);
/* update stream map of the first CS */
if (hdev->supports_wait_for_multi_cs)
staged_cs->fence->stream_map |= cs->fence->stream_map;
}
list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
......
......@@ -1814,6 +1814,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
hdev->supports_sync_stream = true;
hdev->supports_coresight = true;
hdev->supports_staged_submission = true;
hdev->supports_wait_for_multi_cs = true;
gaudi_set_pci_memory_regions(hdev);
......
......@@ -958,6 +958,7 @@ static int goya_sw_init(struct hl_device *hdev)
hdev->supports_coresight = true;
hdev->supports_soft_reset = true;
hdev->allow_external_soft_reset = true;
hdev->supports_wait_for_multi_cs = false;
goya_set_pci_memory_regions(hdev);
......
......@@ -735,11 +735,18 @@ union hl_cs_args {
#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
#define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32
struct hl_wait_cs_in {
union {
struct {
/* Command submission sequence number */
/*
* In case of wait_cs holds the CS sequence number.
* In case of wait for multi CS hold a user pointer to
* an array of CS sequence numbers
*/
__u64 seq;
/* Absolute timeout to wait for command submission
* in microseconds
......@@ -767,12 +774,17 @@ struct hl_wait_cs_in {
/* Context ID - Currently not in use */
__u32 ctx_id;
/* HL_WAIT_CS_FLAGS_*
* If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
* interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order
* not to specify an interrupt id ,set mask to all 1s.
*/
__u32 flags;
/* Multi CS API info- valid entries in multi-CS array */
__u8 seq_arr_len;
__u8 pad[7];
};
#define HL_WAIT_CS_STATUS_COMPLETED 0
......@@ -789,8 +801,15 @@ struct hl_wait_cs_out {
__u32 status;
/* HL_WAIT_CS_STATUS_FLAG* */
__u32 flags;
/* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */
/*
* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set
* for wait_cs: timestamp of CS completion
* for wait_multi_cs: timestamp of FIRST CS completion
*/
__s64 timestamp_nsec;
/* multi CS completion bitmap */
__u32 cs_completion_map;
__u32 pad;
};
union hl_wait_cs_args {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment