Commit f9e5f295 authored by Omer Shpigelman's avatar Omer Shpigelman Committed by Oded Gabbay

uapi: habanalabs: add signal/wait operations

This is a pre-requisite to upstreaming GAUDI support.

Signal/wait operations are done by the user to perform sync between two
Primary Queues (PQs). The sync is done using the sync manager and it is
usually resolved inside the device, but sometimes it can be resolved in the
host, i.e. the user should be able to wait in the host until a signal has
been completed.

The mechanism to define signal and wait operations is done by the driver
because it needs atomicity and serialization, which is already done in the
driver when submitting work to the different queues.

To implement this feature, the driver "takes" a couple of h/w resources,
and this is reflected by the defines added to the uapi file.

The signal/wait operations are done via the existing CS IOCTL, and they use
the same data structure. There is a difference in the meaning of some of
the parameters, and for that we added unions to make the code more
readable.
Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 824b4578
...@@ -11,6 +11,8 @@ ...@@ -11,6 +11,8 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/slab.h> #include <linux/slab.h>
#define HL_CS_FLAGS_SIG_WAIT (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)
static void job_wq_completion(struct work_struct *work); static void job_wq_completion(struct work_struct *work);
static long _hl_cs_wait_ioctl(struct hl_device *hdev, static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq); struct hl_ctx *ctx, u64 timeout_us, u64 seq);
...@@ -659,7 +661,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) ...@@ -659,7 +661,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
union hl_cs_args *args = data; union hl_cs_args *args = data;
struct hl_ctx *ctx = hpriv->ctx; struct hl_ctx *ctx = hpriv->ctx;
void __user *chunks_execute, *chunks_restore; void __user *chunks_execute, *chunks_restore;
u32 num_chunks_execute, num_chunks_restore; u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
u64 cs_seq = ULONG_MAX; u64 cs_seq = ULONG_MAX;
int rc, do_ctx_switch; int rc, do_ctx_switch;
bool need_soft_reset = false; bool need_soft_reset = false;
...@@ -672,6 +674,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) ...@@ -672,6 +674,15 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
goto out; goto out;
} }
sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
(!hdev->supports_sync_stream))) {
dev_err(hdev->dev, "Sync stream CS is not supported\n");
rc = -EINVAL;
goto out;
}
chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute; chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
num_chunks_execute = args->in.num_chunks_execute; num_chunks_execute = args->in.num_chunks_execute;
......
...@@ -1347,6 +1347,7 @@ struct hl_device_idle_busy_ts { ...@@ -1347,6 +1347,7 @@ struct hl_device_idle_busy_ts {
* only to POWER9 machines. * only to POWER9 machines.
* @cdev_sysfs_created: were char devices and sysfs nodes created. * @cdev_sysfs_created: were char devices and sysfs nodes created.
* @stop_on_err: true if engines should stop on error. * @stop_on_err: true if engines should stop on error.
* @supports_sync_stream: is sync stream supported.
*/ */
struct hl_device { struct hl_device {
struct pci_dev *pdev; struct pci_dev *pdev;
...@@ -1429,6 +1430,7 @@ struct hl_device { ...@@ -1429,6 +1430,7 @@ struct hl_device {
u8 power9_64bit_dma_enable; u8 power9_64bit_dma_enable;
u8 cdev_sysfs_created; u8 cdev_sysfs_created;
u8 stop_on_err; u8 stop_on_err;
u8 supports_sync_stream;
/* Parameters for bring-up */ /* Parameters for bring-up */
u8 mmu_enable; u8 mmu_enable;
......
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
* *
* Copyright 2016-2019 HabanaLabs, Ltd. * Copyright 2016-2020 HabanaLabs, Ltd.
* All Rights Reserved. * All Rights Reserved.
* *
*/ */
...@@ -241,52 +241,87 @@ union hl_cb_args { ...@@ -241,52 +241,87 @@ union hl_cb_args {
* compatibility * compatibility
*/ */
struct hl_cs_chunk { struct hl_cs_chunk {
/* union {
* For external queue, this represents a Handle of CB on the Host /* For external queue, this represents a Handle of CB on the
* For internal queue, this represents an SRAM or DRAM address of the * Host.
* internal CB * For internal queue in Goya, this represents an SRAM or
* a DRAM address of the internal CB. In Gaudi, this might also
* represent a mapped host address of the CB.
*
* A mapped host address is in the device address space, after
* a host address was mapped by the device MMU.
*/ */
__u64 cb_handle; __u64 cb_handle;
/* Relevant only when HL_CS_FLAGS_WAIT is set.
* This holds address of array of u64 values that contain
* signal CS sequence numbers. The wait described by this job
* will listen on all those signals (wait event per signal)
*/
__u64 signal_seq_arr;
};
/* Index of queue to put the CB on */ /* Index of queue to put the CB on */
__u32 queue_index; __u32 queue_index;
union {
/* /*
* Size of command buffer with valid packets * Size of command buffer with valid packets
* Can be smaller than actual CB size * Can be smaller than actual CB size
*/ */
__u32 cb_size; __u32 cb_size;
/* Relevant only when HL_CS_FLAGS_WAIT is set.
* Number of entries in signal_seq_arr
*/
__u32 num_signal_seq_arr;
};
/* HL_CS_CHUNK_FLAGS_* */ /* HL_CS_CHUNK_FLAGS_* */
__u32 cs_chunk_flags; __u32 cs_chunk_flags;
/* Align structure to 64 bytes */ /* Align structure to 64 bytes */
__u32 pad[11]; __u32 pad[11];
}; };
/* SIGNAL and WAIT flags are mutually exclusive */
#define HL_CS_FLAGS_FORCE_RESTORE 0x1 #define HL_CS_FLAGS_FORCE_RESTORE 0x1
#define HL_CS_FLAGS_SIGNAL 0x2
#define HL_CS_FLAGS_WAIT 0x4
#define HL_CS_STATUS_SUCCESS 0 #define HL_CS_STATUS_SUCCESS 0
#define HL_MAX_JOBS_PER_CS 512 #define HL_MAX_JOBS_PER_CS 512
struct hl_cs_in { struct hl_cs_in {
/* this holds address of array of hl_cs_chunk for restore phase */ /* this holds address of array of hl_cs_chunk for restore phase */
__u64 chunks_restore; __u64 chunks_restore;
/* this holds address of array of hl_cs_chunk for execution phase */
/* holds address of array of hl_cs_chunk for execution phase */
__u64 chunks_execute; __u64 chunks_execute;
/* this holds address of array of hl_cs_chunk for store phase - /* this holds address of array of hl_cs_chunk for store phase -
* Currently not in use * Currently not in use
*/ */
__u64 chunks_store; __u64 chunks_store;
/* Number of chunks in restore phase array. Maximum number is /* Number of chunks in restore phase array. Maximum number is
* HL_MAX_JOBS_PER_CS * HL_MAX_JOBS_PER_CS
*/ */
__u32 num_chunks_restore; __u32 num_chunks_restore;
/* Number of chunks in execution array. Maximum number is /* Number of chunks in execution array. Maximum number is
* HL_MAX_JOBS_PER_CS * HL_MAX_JOBS_PER_CS
*/ */
__u32 num_chunks_execute; __u32 num_chunks_execute;
/* Number of chunks in restore phase array - Currently not in use */ /* Number of chunks in restore phase array - Currently not in use */
__u32 num_chunks_store; __u32 num_chunks_store;
/* HL_CS_FLAGS_* */ /* HL_CS_FLAGS_* */
__u32 cs_flags; __u32 cs_flags;
/* Context ID - Currently not in use */ /* Context ID - Currently not in use */
__u32 ctx_id; __u32 ctx_id;
}; };
...@@ -597,8 +632,8 @@ struct hl_debug_args { ...@@ -597,8 +632,8 @@ struct hl_debug_args {
* For jobs on external queues, the user needs to create command buffers * For jobs on external queues, the user needs to create command buffers
* through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on * through the CB ioctl and give the CB's handle to the CS ioctl. For jobs on
* internal queues, the user needs to prepare a "command buffer" with packets * internal queues, the user needs to prepare a "command buffer" with packets
* on either the SRAM or DRAM, and give the device address of that buffer to * on either the device SRAM/DRAM or the host, and give the device address of
* the CS ioctl. * that buffer to the CS ioctl.
* *
* This IOCTL is asynchronous in regard to the actual execution of the CS. This * This IOCTL is asynchronous in regard to the actual execution of the CS. This
* means it returns immediately after ALL the JOBS were enqueued on their * means it returns immediately after ALL the JOBS were enqueued on their
...@@ -610,7 +645,7 @@ struct hl_debug_args { ...@@ -610,7 +645,7 @@ struct hl_debug_args {
* external JOBS have been completed. Note that if the CS has internal JOBS * external JOBS have been completed. Note that if the CS has internal JOBS
* which can execute AFTER the external JOBS have finished, the driver might * which can execute AFTER the external JOBS have finished, the driver might
* report that the CS has finished executing BEFORE the internal JOBS have * report that the CS has finished executing BEFORE the internal JOBS have
* actually finish executing. * actually finished executing.
* *
* Even though the sequence number increments per CS, the user can NOT * Even though the sequence number increments per CS, the user can NOT
* automatically assume that if CS with sequence number N finished, then CS * automatically assume that if CS with sequence number N finished, then CS
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment