Commit 215f0c17 authored by Ohad Sharabi, committed by Oded Gabbay

habanalabs: add wait-for-multi-CS uAPI

When user sends multiple CSs, waiting for each CS is not efficient
as it involves many user-kernel context switches.

In order to address this issue we add support to "wait on multiple CSs"
using a new uAPI which can wait on a maximum of 32 CSs. The new uAPI is
defined using a new flag - WAIT_FOR_MULTI_CS - in the wait_for_cs IOCTL.

The input parameters for this uAPI will be:
@seq: user pointer to an array of up to 32 CS's sequence numbers.
@seq_arr_len: length of sequence array.
@timeout_us: timeout for waiting for any CS.

The output parameters for this API will be:
@status: multi CS ioctl completion status (dedicated status was added as
         well).
@flags: bitmap of output flags of the CS.
@cs_completion_map: bitmap for multi CS: if the CS sequence that was placed
                    at index N of the input seq array has completed, the
                    N-th bit in cs_completion_map will be 1; otherwise it
                    will be 0.
@timestamp_nsec: timestamp of the first completed CS
Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent c457d5ab
...@@ -229,7 +229,17 @@ int hl_ctx_put(struct hl_ctx *ctx) ...@@ -229,7 +229,17 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release); return kref_put(&ctx->refcount, hl_ctx_do_release);
} }
/* this function shall be called with cs_lock locked */ /*
* hl_ctx_get_fence_locked - get CS fence under CS lock
*
* @ctx: pointer to the context structure.
* @seq: CS sequence number
*
* @return valid fence pointer on success, NULL if fence is gone, otherwise
* error pointer.
*
* NOTE: this function shall be called with cs_lock locked
*/
static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq) static struct hl_fence *hl_ctx_get_fence_locked(struct hl_ctx *ctx, u64 seq)
{ {
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
...@@ -259,6 +269,16 @@ struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) ...@@ -259,6 +269,16 @@ struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return fence; return fence;
} }
/*
* hl_ctx_get_fences - get multiple CS fences under the same CS lock
*
* @ctx: pointer to the context structure.
* @seq_arr: array of CS sequences to wait for
* @fence: fence array to store the CS fences
* @arr_len: length of both the seq_arr and fence arrays
*
* @return 0 on success, otherwise non 0 error code
*/
int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
struct hl_fence **fence, u32 arr_len) struct hl_fence **fence, u32 arr_len)
{ {
......
...@@ -1297,6 +1297,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) ...@@ -1297,6 +1297,10 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
if (rc) if (rc)
goto user_interrupts_fini; goto user_interrupts_fini;
/* initialize completion structure for multi CS wait */
hl_multi_cs_completion_init(hdev);
/* /*
* Initialize the H/W queues. Must be done before hw_init, because * Initialize the H/W queues. Must be done before hw_init, because
* there the addresses of the kernel queue are being written to the * there the addresses of the kernel queue are being written to the
......
...@@ -585,7 +585,8 @@ struct asic_fixed_properties { ...@@ -585,7 +585,8 @@ struct asic_fixed_properties {
* @cs_sequence: sequence of the corresponding command submission * @cs_sequence: sequence of the corresponding command submission
* @error: mark this fence with error * @error: mark this fence with error
* @timestamp: timestamp upon completion * @timestamp: timestamp upon completion
* * @stream_map: streams bitmap to represent all streams that multi cs is
* waiting on
*/ */
struct hl_fence { struct hl_fence {
struct completion completion; struct completion completion;
...@@ -593,6 +594,7 @@ struct hl_fence { ...@@ -593,6 +594,7 @@ struct hl_fence {
u64 cs_sequence; u64 cs_sequence;
int error; int error;
ktime_t timestamp; ktime_t timestamp;
u8 stream_map;
}; };
/** /**
...@@ -2234,6 +2236,58 @@ struct hl_mmu_funcs { ...@@ -2234,6 +2236,58 @@ struct hl_mmu_funcs {
u64 virt_addr, struct hl_mmu_hop_info *hops); u64 virt_addr, struct hl_mmu_hop_info *hops);
}; };
/**
* number of user contexts allowed to call wait_for_multi_cs ioctl in
* parallel
*/
#define MULTI_CS_MAX_USER_CTX 2
/**
* struct multi_cs_completion - multi CS wait completion.
*
* The device structure (struct hl_device) embeds an array of
* MULTI_CS_MAX_USER_CTX of these entries in its multi_cs_completion field.
*
* @completion: completion of any of the CS in the list
* @lock: spinlock for the completion structure
* @timestamp: timestamp for the multi-CS completion
*             NOTE(review): units are not visible here; the uAPI output
*             field is named timestamp_nsec - confirm this is nanoseconds.
* @used: 1 if in use, otherwise 0
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting
*/
struct multi_cs_completion {
struct completion completion;
spinlock_t lock;
s64 timestamp;
u8 used;
u8 stream_map;
};
/**
* struct multi_cs_data - internal data for multi CS call
*
* The uAPI caps the sequence array at HL_WAIT_MULTI_CS_LIST_MAX_LEN (32)
* entries, which is why a u8 length and a u32 bitmap are wide enough here.
*
* @ctx: pointer to the context structure
* @fence_arr: array of fences of all CSs
* @seq_arr: array of CS sequence numbers
* @timeout_us: timeout in usec for waiting for CS to complete
* @timestamp: timestamp of first completed CS
* @wait_status: wait for CS status
* @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
*                     NOTE(review): presumably bit N corresponds to index N
*                     of seq_arr, mirroring cs_completion_map in the uAPI -
*                     confirm against the code that fills it.
* @arr_len: fence_arr and seq_arr array length
* @stream_map: bitmap of all HW/external queues streams on which the multi-CS
* is waiting
* @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
* @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
*/
struct multi_cs_data {
struct hl_ctx *ctx;
struct hl_fence **fence_arr;
u64 *seq_arr;
s64 timeout_us;
s64 timestamp;
long wait_status;
u32 completion_bitmap;
u8 arr_len;
u8 stream_map;
u8 gone_cs;
u8 update_ts;
};
/** /**
* struct hl_device - habanalabs device structure. * struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device. * @pdev: pointer to PCI device, can be NULL in case of simulator device.
...@@ -2300,6 +2354,7 @@ struct hl_mmu_funcs { ...@@ -2300,6 +2354,7 @@ struct hl_mmu_funcs {
* @fw_loader: FW loader manager. * @fw_loader: FW loader manager.
* @pci_mem_region: array of memory regions in the PCI * @pci_mem_region: array of memory regions in the PCI
* @state_dump_specs: constants and dictionaries needed to dump system state. * @state_dump_specs: constants and dictionaries needed to dump system state.
* @multi_cs_completion: array of multi-CS completion.
* @dram_used_mem: current DRAM memory consumption. * @dram_used_mem: current DRAM memory consumption.
* @timeout_jiffies: device CS timeout value. * @timeout_jiffies: device CS timeout value.
* @max_power: the max power of the device, as configured by the sysadmin. This * @max_power: the max power of the device, as configured by the sysadmin. This
...@@ -2376,6 +2431,7 @@ struct hl_mmu_funcs { ...@@ -2376,6 +2431,7 @@ struct hl_mmu_funcs {
* halted. We can't halt it again because the COMMS * halted. We can't halt it again because the COMMS
* protocol will throw an error. Relevant only for * protocol will throw an error. Relevant only for
* cases where Linux was not loaded to device CPU * cases where Linux was not loaded to device CPU
* @supports_wait_for_multi_cs: true if wait for multi CS is supported
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -2446,6 +2502,9 @@ struct hl_device { ...@@ -2446,6 +2502,9 @@ struct hl_device {
struct hl_state_dump_specs state_dump_specs; struct hl_state_dump_specs state_dump_specs;
struct multi_cs_completion multi_cs_completion[
MULTI_CS_MAX_USER_CTX];
atomic64_t dram_used_mem; atomic64_t dram_used_mem;
u64 timeout_jiffies; u64 timeout_jiffies;
u64 max_power; u64 max_power;
...@@ -2495,6 +2554,7 @@ struct hl_device { ...@@ -2495,6 +2554,7 @@ struct hl_device {
u8 curr_reset_cause; u8 curr_reset_cause;
u8 skip_reset_on_timeout; u8 skip_reset_on_timeout;
u8 device_cpu_is_halted; u8 device_cpu_is_halted;
u8 supports_wait_for_multi_cs;
/* Parameters for bring-up */ /* Parameters for bring-up */
u64 nic_ports_mask; u64 nic_ports_mask;
...@@ -2701,6 +2761,7 @@ bool cs_needs_completion(struct hl_cs *cs); ...@@ -2701,6 +2761,7 @@ bool cs_needs_completion(struct hl_cs *cs);
bool cs_needs_timeout(struct hl_cs *cs); bool cs_needs_timeout(struct hl_cs *cs);
bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
void hl_multi_cs_completion_init(struct hl_device *hdev);
void goya_set_asic_funcs(struct hl_device *hdev); void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev); void gaudi_set_asic_funcs(struct hl_device *hdev);
......
...@@ -603,6 +603,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) ...@@ -603,6 +603,10 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
} }
list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node); list_add_tail(&cs->staged_cs_node, &staged_cs->staged_cs_node);
/* update stream map of the first CS */
if (hdev->supports_wait_for_multi_cs)
staged_cs->fence->stream_map |= cs->fence->stream_map;
} }
list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list); list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
......
...@@ -1814,6 +1814,7 @@ static int gaudi_sw_init(struct hl_device *hdev) ...@@ -1814,6 +1814,7 @@ static int gaudi_sw_init(struct hl_device *hdev)
hdev->supports_sync_stream = true; hdev->supports_sync_stream = true;
hdev->supports_coresight = true; hdev->supports_coresight = true;
hdev->supports_staged_submission = true; hdev->supports_staged_submission = true;
hdev->supports_wait_for_multi_cs = true;
gaudi_set_pci_memory_regions(hdev); gaudi_set_pci_memory_regions(hdev);
......
...@@ -958,6 +958,7 @@ static int goya_sw_init(struct hl_device *hdev) ...@@ -958,6 +958,7 @@ static int goya_sw_init(struct hl_device *hdev)
hdev->supports_coresight = true; hdev->supports_coresight = true;
hdev->supports_soft_reset = true; hdev->supports_soft_reset = true;
hdev->allow_external_soft_reset = true; hdev->allow_external_soft_reset = true;
hdev->supports_wait_for_multi_cs = false;
goya_set_pci_memory_regions(hdev); goya_set_pci_memory_regions(hdev);
......
...@@ -735,11 +735,18 @@ union hl_cs_args { ...@@ -735,11 +735,18 @@ union hl_cs_args {
#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 #define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 #define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
#define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32
struct hl_wait_cs_in { struct hl_wait_cs_in {
union { union {
struct { struct {
/* Command submission sequence number */ /*
* In case of wait_cs, holds the CS sequence number.
* In case of wait for multi CS, holds a user pointer to
* an array of CS sequence numbers
*/
__u64 seq; __u64 seq;
/* Absolute timeout to wait for command submission /* Absolute timeout to wait for command submission
* in microseconds * in microseconds
...@@ -767,12 +774,17 @@ struct hl_wait_cs_in { ...@@ -767,12 +774,17 @@ struct hl_wait_cs_in {
/* Context ID - Currently not in use */ /* Context ID - Currently not in use */
__u32 ctx_id; __u32 ctx_id;
/* HL_WAIT_CS_FLAGS_* /* HL_WAIT_CS_FLAGS_*
* If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include * If HL_WAIT_CS_FLAGS_INTERRUPT is set, this field should include
* interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order * interrupt id according to HL_WAIT_CS_FLAGS_INTERRUPT_MASK, in order
* not to specify an interrupt id ,set mask to all 1s. * not to specify an interrupt id ,set mask to all 1s.
*/ */
__u32 flags; __u32 flags;
/* Multi CS API info- valid entries in multi-CS array */
__u8 seq_arr_len;
__u8 pad[7];
}; };
#define HL_WAIT_CS_STATUS_COMPLETED 0 #define HL_WAIT_CS_STATUS_COMPLETED 0
...@@ -789,8 +801,15 @@ struct hl_wait_cs_out { ...@@ -789,8 +801,15 @@ struct hl_wait_cs_out {
__u32 status; __u32 status;
/* HL_WAIT_CS_STATUS_FLAG* */ /* HL_WAIT_CS_STATUS_FLAG* */
__u32 flags; __u32 flags;
/* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set */ /*
* valid only if HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD is set
* for wait_cs: timestamp of CS completion
* for wait_multi_cs: timestamp of FIRST CS completion
*/
__s64 timestamp_nsec; __s64 timestamp_nsec;
/* multi CS completion bitmap */
__u32 cs_completion_map;
__u32 pad;
}; };
union hl_wait_cs_args { union hl_wait_cs_args {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment