Commit b78cda79 authored by Greg Kroah-Hartman

Merge tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux into char-misc-next

Oded writes:

This tag contains the following changes for kernel 5.5:

- MMU code improvements that include:
  - Distinguish between "normal" unmapping and unmapping that is done as
    part of the tear-down of a user process. This improves performance of
    unmapping during reset of the device.
  - Add future ASIC support in generic MMU code.

- Improve device reset code by adding more protection around accessing the
  device during the reset process.

- Add a new H/W queue type for future ASIC support.

- Add more information to be retrieved by users through the INFO IOCTL
  (a user-space usage sketch follows this list):
  - clock rate
  - board name
  - reset counters

- Small bug fixes and minor improvements to code.
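
For reference, below is a minimal user-space sketch of pulling the new clock-rate
and reset-counter information through the INFO IOCTL. The op and struct/field
names (HL_INFO_CLK_RATE, HL_INFO_RESET_COUNT, struct hl_info_clk_rate,
struct hl_info_reset_count), the header install path and the device node name
are assumptions recalled from the exported uapi header, not taken from this
tag's diff; verify them against include/uapi/misc/habanalabs.h before use.

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <misc/habanalabs.h>	/* assumed install path of the exported uapi header */

	/* Hypothetical helper: wrap HL_IOCTL_INFO for a given op code. */
	static int hl_info(int fd, __u32 op, void *buf, __u32 size)
	{
		struct hl_info_args args;

		memset(&args, 0, sizeof(args));
		args.op = op;
		args.return_pointer = (__u64) (uintptr_t) buf;
		args.return_size = size;

		return ioctl(fd, HL_IOCTL_INFO, &args);
	}

	int main(void)
	{
		struct hl_info_clk_rate clk = {0};
		struct hl_info_reset_count reset = {0};
		int fd = open("/dev/hl0", O_RDWR);	/* device node name is an assumption */

		if (fd < 0)
			return 1;

		/* Clock rate (MHz) exposed by the HL_INFO_CLK_RATE op */
		if (!hl_info(fd, HL_INFO_CLK_RATE, &clk, sizeof(clk)))
			printf("clk: cur %u MHz, max %u MHz\n",
				clk.cur_clk_rate_mhz, clk.max_clk_rate_mhz);

		/* Hard/soft reset counters exposed by HL_INFO_RESET_COUNT */
		if (!hl_info(fd, HL_INFO_RESET_COUNT, &reset, sizeof(reset)))
			printf("resets: hard %u, soft %u\n",
				reset.hard_reset_cnt, reset.soft_reset_cnt);

		close(fd);
		return 0;
	}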

* tag 'misc-habanalabs-next-2019-11-21' of git://people.freedesktop.org/~gabbayo/linux: (31 commits)
  habanalabs: add more protection of device during reset
  habanalabs: flush EQ workers in hard reset
  habanalabs: make the reset code more consistent
  habanalabs: expose reset counters via existing INFO IOCTL
  habanalabs: make code more concise
  habanalabs: use defines for F/W files
  habanalabs: remove prints on successful device initialization
  habanalabs: remove unnecessary checks
  habanalabs: invalidate MMU cache only once
  habanalabs: skip VA block list update in reset flow
  habanalabs: optimize MMU unmap
  habanalabs: prevent read/write from/to the device during hard reset
  habanalabs: split MMU properties to PCI/DRAM
  habanalabs: re-factor MMU masks and documentation
  habanalabs: type specific MMU cache invalidation
  habanalabs: re-factor memory module code
  habanalabs: export uapi defines to user-space
  habanalabs: don't print error when queues are full
  habanalabs: increase max jobs number to 512
  habanalabs: set ETR as non-secured
  ...
parents 599ea01c 5feccddc
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	/*
+	 * Patched CB is created for external queues jobs, and for H/W queues
+	 * jobs if the user CB was allocated by driver and MMU is disabled.
+	 */
+	return (job->queue_type == QUEUE_TYPE_EXT ||
+			(job->queue_type == QUEUE_TYPE_HW &&
+				job->is_kernel_allocated_cb &&
+				!hdev->mmu_enable));
+}
+
 /*
  * cs_parser - parse the user command submission
  *
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	parser.patched_cb = NULL;
 	parser.user_cb = job->user_cb;
 	parser.user_cb_size = job->user_cb_size;
-	parser.ext_queue = job->ext_queue;
+	parser.queue_type = job->queue_type;
+	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
 	job->patched_cb = NULL;
 
 	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
-	if (job->ext_queue) {
+
+	if (is_cb_patched(hdev, job)) {
 		if (!rc) {
 			job->patched_cb = parser.patched_cb;
 			job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job)) {
 		hl_userptr_delete_list(hdev, &job->userptr_list);
 
 		/*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 		}
 	}
 
+	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+	 * enabled, the user CB isn't released in cs_parser() and thus should be
+	 * released here.
+	 */
+	if (job->queue_type == QUEUE_TYPE_HW &&
+			job->is_kernel_allocated_cb && hdev->mmu_enable) {
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+
+		hl_cb_put(job->user_cb);
+	}
+
 	/*
 	 * This is the only place where there can be multiple threads
 	 * modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
 	hl_debugfs_remove_job(hdev, job);
 
-	if (job->ext_queue)
+	if (job->queue_type == QUEUE_TYPE_EXT ||
+			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
 	kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
 	free_job(hdev, job);
 }
 
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
-					struct hl_cb_mgr *cb_mgr,
-					struct hl_cs_chunk *chunk,
-					bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+				struct hl_cs_chunk *chunk,
+				enum hl_queue_type *queue_type,
+				bool *is_kernel_allocated_cb)
 {
 	struct asic_fixed_properties *asic = &hdev->asic_prop;
 	struct hw_queue_properties *hw_queue_prop;
-	u32 cb_handle;
-	struct hl_cb *cb;
-
-	/* Assume external queue */
-	*ext_queue = true;
 
 	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
 
@@ -406,20 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 			(hw_queue_prop->type == QUEUE_TYPE_NA)) {
 		dev_err(hdev->dev, "Queue index %d is invalid\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
 	if (hw_queue_prop->driver_only) {
 		dev_err(hdev->dev,
 			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
-		return NULL;
-	} else if (hw_queue_prop->type == QUEUE_TYPE_INT) {
-		*ext_queue = false;
-		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
+		return -EINVAL;
 	}
 
-	/* Retrieve CB object */
+	*queue_type = hw_queue_prop->type;
+	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+	return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk)
+{
+	struct hl_cb *cb;
+	u32 cb_handle;
+
 	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
 
 	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -444,7 +476,8 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 	return NULL;
 }
 
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
 {
 	struct hl_cs_job *job;
 
@@ -452,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
 	if (!job)
 		return NULL;
 
-	job->ext_queue = ext_queue;
+	job->queue_type = queue_type;
+	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job))
 		INIT_LIST_HEAD(&job->userptr_list);
+
+	if (job->queue_type == QUEUE_TYPE_EXT)
 		INIT_WORK(&job->finish_work, job_wq_completion);
-	}
 
 	return job;
 }
@@ -470,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs_job *job;
 	struct hl_cs *cs;
 	struct hl_cb *cb;
-	bool ext_queue_present = false;
+	bool int_queues_only = true;
 	u32 size_to_copy;
 	int rc, i, parse_cnt;
 
@@ -514,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	/* Validate ALL the CS chunks before submitting the CS */
 	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
 		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
-		bool ext_queue;
+		enum hl_queue_type queue_type;
+		bool is_kernel_allocated_cb;
 
-		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
-					&ext_queue);
-		if (ext_queue) {
-			ext_queue_present = true;
+		rc = validate_queue_index(hdev, chunk, &queue_type,
+					&is_kernel_allocated_cb);
+		if (rc)
+			goto free_cs_object;
+
+		if (is_kernel_allocated_cb) {
+			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
 			if (!cb) {
 				rc = -EINVAL;
 				goto free_cs_object;
 			}
+		} else {
+			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 		}
 
-		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+			int_queues_only = false;
+
+		job = hl_cs_allocate_job(hdev, queue_type,
+						is_kernel_allocated_cb);
 		if (!job) {
 			dev_err(hdev->dev, "Failed to allocate a new job\n");
 			rc = -ENOMEM;
-			if (ext_queue)
+			if (is_kernel_allocated_cb)
 				goto release_cb;
 			else
 				goto free_cs_object;
@@ -540,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		job->cs = cs;
 		job->user_cb = cb;
 		job->user_cb_size = chunk->cb_size;
-		if (job->ext_queue)
+		if (is_kernel_allocated_cb)
 			job->job_cb_size = cb->size;
 		else
 			job->job_cb_size = chunk->cb_size;
@@ -553,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		/*
 		 * Increment CS reference. When CS reference is 0, CS is
 		 * done and can be signaled to user and free all its resources
-		 * Only increment for JOB on external queues, because only
-		 * for those JOBs we get completion
+		 * Only increment for JOB on external or H/W queues, because
+		 * only for those JOBs we get completion
 		 */
-		if (job->ext_queue)
+		if (job->queue_type == QUEUE_TYPE_EXT ||
+				job->queue_type == QUEUE_TYPE_HW)
 			cs_get(cs);
 
 		hl_debugfs_add_job(hdev, job);
@@ -570,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		}
 	}
 
-	if (!ext_queue_present) {
+	if (int_queues_only) {
 		dev_err(hdev->dev,
-			"Reject CS %d.%llu because no external queues jobs\n",
+			"Reject CS %d.%llu because only internal queues jobs are present\n",
 			cs->ctx->asid, cs->sequence);
 		rc = -EINVAL;
 		goto free_cs_object;
@@ -580,6 +626,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
+		if (rc != -EAGAIN)
 		dev_err(hdev->dev,
 			"Failed to submit CS %d.%llu to H/W queues, error %d\n",
 			cs->ctx->asid, cs->sequence, rc);
......
@@ -307,45 +307,57 @@ static inline u64 get_hop0_addr(struct hl_ctx *ctx)
 			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
 }
 
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-				u64 virt_addr)
+static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
+				u64 virt_addr, u64 mask, u64 shift)
 {
 	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP0_MASK) >> HOP0_SHIFT);
+			((virt_addr & mask) >> shift);
 }
 
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-				u64 virt_addr)
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_specs,
+				u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP1_MASK) >> HOP1_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop0_mask,
+				mmu_specs->hop0_shift);
 }
 
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-				u64 virt_addr)
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_specs,
+				u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP2_MASK) >> HOP2_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop1_mask,
+				mmu_specs->hop1_shift);
 }
 
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-				u64 virt_addr)
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_specs,
+				u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP3_MASK) >> HOP3_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop2_mask,
+				mmu_specs->hop2_shift);
 }
 
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-				u64 virt_addr)
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_specs,
+				u64 hop_addr, u64 vaddr)
 {
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & HOP4_MASK) >> HOP4_SHIFT);
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop3_mask,
+				mmu_specs->hop3_shift);
+}
+
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_specs,
+				u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop4_mask,
+				mmu_specs->hop4_shift);
 }
 
 static inline u64 get_next_hop_addr(u64 curr_pte)
 {
 	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & PHYS_ADDR_MASK;
+		return curr_pte & HOP_PHYS_ADDR_MASK;
 	else
 		return ULLONG_MAX;
 }
@@ -355,7 +367,10 @@ static int mmu_show(struct seq_file *s, void *data)
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	struct hl_ctx *ctx;
+	bool is_dram_addr;
 
 	u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
 		hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
@@ -377,33 +392,39 @@ static int mmu_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+					prop->va_space_dram_start_address,
+					prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* the following lookup is copied from unmap() in mmu.c */
 
 	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
 	hop0_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
 	hop1_addr = get_next_hop_addr(hop0_pte);
 
 	if (hop1_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr);
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
 	hop1_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
 	hop2_addr = get_next_hop_addr(hop1_pte);
 
 	if (hop2_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr);
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
 	hop2_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
 	hop3_addr = get_next_hop_addr(hop2_pte);
 
 	if (hop3_addr == ULLONG_MAX)
 		goto not_mapped;
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr);
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 	hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
 
 	if (!(hop3_pte & LAST_MASK)) {
@@ -412,7 +433,8 @@ static int mmu_show(struct seq_file *s, void *data)
 		if (hop4_addr == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr);
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
 		hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
 		if (!(hop4_pte & PAGE_PRESENT_MASK))
 			goto not_mapped;
@@ -506,6 +528,12 @@ static int engines_show(struct seq_file *s, void *data)
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev,
+				"Can't check device idle during reset\n");
+		return 0;
+	}
+
 	hdev->asic_funcs->is_device_idle(hdev, NULL, s);
 
 	return 0;
@@ -534,41 +562,50 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
 	struct hl_ctx *ctx = hdev->compute_ctx;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
 	u64 hop_addr, hop_pte_addr, hop_pte;
-	u64 offset_mask = HOP4_MASK | OFFSET_MASK;
+	u64 offset_mask = HOP4_MASK | FLAGS_MASK;
 	int rc = 0;
+	bool is_dram_addr;
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return -EINVAL;
 	}
 
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+					prop->va_space_dram_start_address,
+					prop->va_space_dram_end_address);
+
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
 	mutex_lock(&ctx->mmu_lock);
 
 	/* hop 0 */
 	hop_addr = get_hop0_addr(ctx);
-	hop_pte_addr = get_hop0_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 1 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop1_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 2 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop2_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	/* hop 3 */
 	hop_addr = get_next_hop_addr(hop_pte);
 	if (hop_addr == ULLONG_MAX)
 		goto not_mapped;
-	hop_pte_addr = get_hop3_pte_addr(ctx, hop_addr, virt_addr);
+	hop_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
 	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
 	if (!(hop_pte & LAST_MASK)) {
@@ -576,10 +613,11 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 		hop_addr = get_next_hop_addr(hop_pte);
 		if (hop_addr == ULLONG_MAX)
 			goto not_mapped;
-		hop_pte_addr = get_hop4_pte_addr(ctx, hop_addr, virt_addr);
+		hop_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop_addr,
+							virt_addr);
 		hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
 
-		offset_mask = OFFSET_MASK;
+		offset_mask = FLAGS_MASK;
 	}
 
 	if (!(hop_pte & PAGE_PRESENT_MASK))
@@ -608,6 +646,11 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	u32 val;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
+		return 0;
+	}
+
 	if (*ppos)
 		return 0;
 
@@ -637,6 +680,11 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	u32 value;
 	ssize_t rc;
 
+	if (atomic_read(&hdev->in_reset)) {
+		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
+		return 0;
+	}
+
 	rc = kstrtouint_from_user(buf, count, 16, &value);
 	if (rc)
 		return rc;
......
@@ -42,12 +42,10 @@ static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
 	struct hl_device *hdev;
-	struct hl_ctx *ctx;
 
 	hpriv = container_of(ref, struct hl_fpriv, refcount);
 
 	hdev = hpriv->hdev;
-	ctx = hpriv->ctx;
 
 	put_pid(hpriv->taskpid);
 
@@ -889,13 +887,19 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
-	/* Kill processes here after CS rollback. This is because the process
-	 * can't really exit until all its CSs are done, which is what we
-	 * do in cs rollback
-	 */
-	if (from_hard_reset_thread)
-		device_kill_open_processes(hdev);
+	if (hard_reset) {
+		/* Kill processes here after CS rollback. This is because the
+		 * process can't really exit until all its CSs are done, which
+		 * is what we do in cs rollback
+		 */
+		device_kill_open_processes(hdev);
+
+		/* Flush the Event queue workers to make sure no other thread is
+		 * reading or writing to registers during the reset
+		 */
+		flush_workqueue(hdev->eq_wq);
+	}
 
 	/* Release kernel context */
 	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;
......
@@ -143,10 +143,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
 		sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
 
 	if (!rc) {
-		if (result == ARMCP_PACKET_FENCE_VAL)
-			dev_info(hdev->dev,
-				"queue test on CPU queue succeeded\n");
-		else
+		if (result != ARMCP_PACKET_FENCE_VAL)
 			dev_err(hdev->dev,
 				"CPU queue test failed (0x%08lX)\n", result);
 	} else {
......
@@ -72,6 +72,9 @@
  *
  */
 
+#define GOYA_UBOOT_FW_FILE	"habanalabs/goya/goya-u-boot.bin"
+#define GOYA_LINUX_FW_FILE	"habanalabs/goya/goya-fit.itb"
+
 #define GOYA_MMU_REGS_NUM		63
 
 #define GOYA_DMA_POOL_BLK_SIZE		0x100 /* 256 bytes */
@@ -337,17 +340,20 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	for (i = 0 ; i < NUMBER_OF_EXT_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_EXT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 1;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES ; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_CPU;
 		prop->hw_queues_props[i].driver_only = 1;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < NUMBER_OF_EXT_HW_QUEUES + NUMBER_OF_CPU_HW_QUEUES +
 			NUMBER_OF_INT_HW_QUEUES; i++) {
 		prop->hw_queues_props[i].type = QUEUE_TYPE_INT;
 		prop->hw_queues_props[i].driver_only = 0;
+		prop->hw_queues_props[i].requires_kernel_cb = 0;
 	}
 
 	for (; i < HL_MAX_QUEUES; i++)
@@ -377,6 +383,23 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 
+	prop->dmmu.hop0_shift = HOP0_SHIFT;
+	prop->dmmu.hop1_shift = HOP1_SHIFT;
+	prop->dmmu.hop2_shift = HOP2_SHIFT;
+	prop->dmmu.hop3_shift = HOP3_SHIFT;
+	prop->dmmu.hop4_shift = HOP4_SHIFT;
+	prop->dmmu.hop0_mask = HOP0_MASK;
+	prop->dmmu.hop1_mask = HOP1_MASK;
+	prop->dmmu.hop2_mask = HOP2_MASK;
+	prop->dmmu.hop3_mask = HOP3_MASK;
+	prop->dmmu.hop4_mask = HOP4_MASK;
+	prop->dmmu.huge_page_size = PAGE_SIZE_2MB;
+
+	/* No difference between PMMU and DMMU except of page size */
+	memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
+	prop->dmmu.page_size = PAGE_SIZE_2MB;
+	prop->pmmu.page_size = PAGE_SIZE_4KB;
+
 	prop->va_space_host_start_address = VA_HOST_SPACE_START;
 	prop->va_space_host_end_address = VA_HOST_SPACE_END;
 	prop->va_space_dram_start_address = VA_DDR_SPACE_START;
@@ -393,6 +416,9 @@ void goya_get_fixed_properties(struct hl_device *hdev)
 	prop->tpc_enabled_mask = TPC_ENABLED_MASK;
 	prop->pcie_dbi_base_address = mmPCIE_DBI_BASE;
 	prop->pcie_aux_dbi_reg_addr = CFG_BASE + mmPCIE_AUX_DBI;
+
+	strncpy(prop->armcp_info.card_name, GOYA_DEFAULT_CARD_NAME,
+		CARD_NAME_MAX_LEN);
 }
 
 /*
@@ -1454,6 +1480,9 @@ static void goya_init_golden_registers(struct hl_device *hdev)
 				1 << TPC0_NRTR_SCRAMB_EN_VAL_SHIFT);
 		WREG32(mmTPC0_NRTR_NON_LIN_SCRAMB + offset,
 				1 << TPC0_NRTR_NON_LIN_SCRAMB_EN_SHIFT);
+
+		WREG32_FIELD(TPC0_CFG_MSS_CONFIG, offset,
+				ICACHE_FETCH_LINE_NUM, 2);
 	}
 
 	WREG32(mmDMA_NRTR_SCRAMB_EN, 1 << DMA_NRTR_SCRAMB_EN_VAL_SHIFT);
@@ -1533,7 +1562,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	u32 mtr_base_lo, mtr_base_hi;
 	u32 so_base_lo, so_base_hi;
 	u32 gic_base_lo, gic_base_hi;
-	u64 qman_base_addr;
 
 	mtr_base_lo = lower_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
 	mtr_base_hi = upper_32_bits(CFG_BASE + mmSYNC_MNGR_MON_PAY_ADDRL_0);
@@ -1545,9 +1573,6 @@ static void goya_init_mme_cmdq(struct hl_device *hdev)
 	gic_base_hi =
 		upper_32_bits(CFG_BASE + mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR);
 
-	qman_base_addr = hdev->asic_prop.sram_base_address +
-				MME_QMAN_BASE_OFFSET;
-
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_LO, mtr_base_lo);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE0_ADDR_HI, mtr_base_hi);
 	WREG32(mmMME_CMDQ_CP_MSG_BASE1_ADDR_LO, so_base_lo);
@@ -2141,13 +2166,11 @@ static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
  */
 static int goya_push_uboot_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-u-boot.bin");
 	dst = hdev->pcie_bar[SRAM_CFG_BAR_ID] + UBOOT_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_UBOOT_FW_FILE, dst);
 }
 
 /*
@@ -2160,13 +2183,11 @@ static int goya_push_uboot_to_device(struct hl_device *hdev)
  */
 static int goya_push_linux_to_device(struct hl_device *hdev)
 {
-	char fw_name[200];
 	void __iomem *dst;
 
-	snprintf(fw_name, sizeof(fw_name), "habanalabs/goya/goya-fit.itb");
 	dst = hdev->pcie_bar[DDR_BAR_ID] + LINUX_FW_OFFSET;
 
-	return hl_fw_push_fw_to_device(hdev, fw_name, dst);
+	return hl_fw_push_fw_to_device(hdev, GOYA_LINUX_FW_FILE, dst);
 }
 
 static int goya_pldm_init_cpu(struct hl_device *hdev)
@@ -2291,6 +2312,10 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		10000,
 		cpu_timeout);
 
+	/* Read U-Boot version now in case we will later fail */
+	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
+	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
+
 	if (rc) {
 		dev_err(hdev->dev, "Error in ARM u-boot!");
 		switch (status) {
@@ -2328,6 +2353,11 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 				"ARM status %d - u-boot stopped by user\n",
 				status);
 			break;
+		case CPU_BOOT_STATUS_TS_INIT_FAIL:
+			dev_err(hdev->dev,
+				"ARM status %d - Thermal Sensor initialization failed\n",
+				status);
+			break;
 		default:
 			dev_err(hdev->dev,
 				"ARM status %d - Invalid status code\n",
@@ -2337,10 +2367,6 @@ static int goya_init_cpu(struct hl_device *hdev, u32 cpu_timeout)
 		return -EIO;
 	}
 
-	/* Read U-Boot version now in case we will later fail */
-	goya_read_device_fw_version(hdev, FW_COMP_UBOOT);
-	goya_read_device_fw_version(hdev, FW_COMP_PREBOOT);
-
 	if (!hdev->fw_loading) {
 		dev_info(hdev->dev, "Skip loading FW\n");
 		goto out;
@@ -2453,7 +2479,8 @@ int goya_mmu_init(struct hl_device *hdev)
 	WREG32_AND(mmSTLB_STLB_FEATURE_EN,
 			(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+					VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
 
 	WREG32(mmMMU_MMU_ENABLE, 1);
 	WREG32(mmMMU_SPI_MASK, 0xF);
@@ -2978,9 +3005,6 @@ int goya_test_queue(struct hl_device *hdev, u32 hw_queue_id)
 			"H/W queue %d test failed (scratch(0x%08llX) == 0x%08X)\n",
 			hw_queue_id, (unsigned long long) fence_dma_addr, tmp);
 		rc = -EIO;
-	} else {
-		dev_info(hdev->dev, "queue test on H/W queue %d succeeded\n",
-			hw_queue_id);
 	}
 
 free_pkt:
@@ -3925,7 +3949,7 @@ static int goya_parse_cb_no_ext_queue(struct hl_device *hdev,
 		return 0;
 
 	dev_err(hdev->dev,
-		"Internal CB address %px + 0x%x is not in SRAM nor in DRAM\n",
+		"Internal CB address 0x%px + 0x%x is not in SRAM nor in DRAM\n",
 		parser->user_cb, parser->user_cb_size);
 
 	return -EFAULT;
@@ -3935,7 +3959,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (!parser->ext_queue)
+	if (parser->queue_type == QUEUE_TYPE_INT)
 		return goya_parse_cb_no_ext_queue(hdev, parser);
 
 	if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4606,7 +4630,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 		lin_dma_pkt++;
 	} while (--lin_dma_pkts_cnt);
 
-	job = hl_cs_allocate_job(hdev, true);
+	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
 	if (!job) {
 		dev_err(hdev->dev, "Failed to allocate a new job\n");
 		rc = -ENOMEM;
@@ -4835,13 +4859,15 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
 		goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
 }
 
-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard)
+static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+					u32 flags)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 status, timeout_usec;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */
@@ -4880,7 +4906,8 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 	u32 status, timeout_usec, inv_data, pi;
 	int rc;
 
-	if (!(goya->hw_cap_initialized & HW_CAP_MMU))
+	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
+		hdev->hard_reset_pending)
 		return;
 
 	/* no need in L1 only invalidation in Goya */
@@ -5137,7 +5164,8 @@ static const struct hl_asic_funcs goya_funcs = {
 	.init_iatu = goya_init_iatu,
 	.rreg = hl_rreg,
 	.wreg = hl_wreg,
-	.halt_coresight = goya_halt_coresight
+	.halt_coresight = goya_halt_coresight,
+	.get_clk_rate = goya_get_clk_rate
 };
 
 /*
......
@@ -233,4 +233,6 @@ void goya_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 					void *vaddr);
 void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
 
+int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
+
 #endif /* GOYAP_H_ */
@@ -8,6 +8,7 @@
 #include "goyaP.h"
 #include "include/goya/goya_coresight.h"
 #include "include/goya/asic_reg/goya_regs.h"
+#include "include/goya/asic_reg/goya_masks.h"
 
 #include <uapi/misc/habanalabs.h>
 
@@ -377,33 +378,32 @@ static int goya_config_etr(struct hl_device *hdev,
 				struct hl_debug_params *params)
 {
 	struct hl_debug_params_etr *input;
-	u64 base_reg = mmPSOC_ETR_BASE - CFG_BASE;
 	u32 val;
 	int rc;
 
-	WREG32(base_reg + 0xFB0, CORESIGHT_UNLOCK);
+	WREG32(mmPSOC_ETR_LAR, CORESIGHT_UNLOCK);
 
-	val = RREG32(base_reg + 0x304);
+	val = RREG32(mmPSOC_ETR_FFCR);
 	val |= 0x1000;
-	WREG32(base_reg + 0x304, val);
+	WREG32(mmPSOC_ETR_FFCR, val);
 	val |= 0x40;
-	WREG32(base_reg + 0x304, val);
+	WREG32(mmPSOC_ETR_FFCR, val);
 
-	rc = goya_coresight_timeout(hdev, base_reg + 0x304, 6, false);
+	rc = goya_coresight_timeout(hdev, mmPSOC_ETR_FFCR, 6, false);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
 				params->enable ? "enable" : "disable", rc);
 		return rc;
 	}
 
-	rc = goya_coresight_timeout(hdev, base_reg + 0xC, 2, true);
+	rc = goya_coresight_timeout(hdev, mmPSOC_ETR_STS, 2, true);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to %s ETR on timeout, error %d\n",
 				params->enable ? "enable" : "disable", rc);
 		return rc;
 	}
 
-	WREG32(base_reg + 0x20, 0);
+	WREG32(mmPSOC_ETR_CTL, 0);
 
 	if (params->enable) {
 		input = params->input;
@@ -423,25 +423,26 @@ static int goya_config_etr(struct hl_device *hdev,
 			return -EINVAL;
 		}
 
-		WREG32(base_reg + 0x34, 0x3FFC);
-		WREG32(base_reg + 0x4, input->buffer_size);
-		WREG32(base_reg + 0x28, input->sink_mode);
-		WREG32(base_reg + 0x110, 0x700);
-		WREG32(base_reg + 0x118,
+		WREG32(mmPSOC_ETR_BUFWM, 0x3FFC);
+		WREG32(mmPSOC_ETR_RSZ, input->buffer_size);
+		WREG32(mmPSOC_ETR_MODE, input->sink_mode);
+		WREG32(mmPSOC_ETR_AXICTL,
+			0x700 | PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT);
+		WREG32(mmPSOC_ETR_DBALO,
			lower_32_bits(input->buffer_address));
-		WREG32(base_reg + 0x11C,
+		WREG32(mmPSOC_ETR_DBAHI,
			upper_32_bits(input->buffer_address));
-		WREG32(base_reg + 0x304, 3);
-		WREG32(base_reg + 0x308, 0xA);
-		WREG32(base_reg + 0x20, 1);
+		WREG32(mmPSOC_ETR_FFCR, 3);
+		WREG32(mmPSOC_ETR_PSCR, 0xA);
+		WREG32(mmPSOC_ETR_CTL, 1);
 	} else {
-		WREG32(base_reg + 0x34, 0);
-		WREG32(base_reg + 0x4, 0x400);
-		WREG32(base_reg + 0x118, 0);
-		WREG32(base_reg + 0x11C, 0);
-		WREG32(base_reg + 0x308, 0);
-		WREG32(base_reg + 0x28, 0);
-		WREG32(base_reg + 0x304, 0);
+		WREG32(mmPSOC_ETR_BUFWM, 0);
+		WREG32(mmPSOC_ETR_RSZ, 0x400);
+		WREG32(mmPSOC_ETR_DBALO, 0);
+		WREG32(mmPSOC_ETR_DBAHI, 0);
+		WREG32(mmPSOC_ETR_PSCR, 0);
+		WREG32(mmPSOC_ETR_MODE, 0);
+		WREG32(mmPSOC_ETR_FFCR, 0);
 
 		if (params->output_size >= sizeof(u64)) {
 			u32 rwp, rwphi;
@@ -451,8 +452,8 @@ static int goya_config_etr(struct hl_device *hdev,
 			 * the buffer is set in the RWP register (lower 32
 			 * bits), and in the RWPHI register (upper 8 bits).
 			 */
-			rwp = RREG32(base_reg + 0x18);
-			rwphi = RREG32(base_reg + 0x3c) & 0xff;
+			rwp = RREG32(mmPSOC_ETR_RWP);
+			rwphi = RREG32(mmPSOC_ETR_RWPHI) & 0xff;
 			*(u64 *) params->output = ((u64) rwphi << 32) | rwp;
 		}
 	}
......
@@ -32,6 +32,37 @@ void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq)
 	}
 }
 
+int goya_get_clk_rate(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk)
+{
+	long value;
+
+	if (hl_device_disabled_or_in_reset(hdev))
+		return -ENODEV;
+
+	value = hl_get_frequency(hdev, MME_PLL, false);
+
+	if (value < 0) {
+		dev_err(hdev->dev, "Failed to retrieve device max clock %ld\n",
+			value);
+		return value;
+	}
+
+	*max_clk = (value / 1000 / 1000);
+
+	value = hl_get_frequency(hdev, MME_PLL, true);
+
+	if (value < 0) {
+		dev_err(hdev->dev,
+			"Failed to retrieve device current clock %ld\n",
+			value);
+		return value;
+	}
+
+	*cur_clk = (value / 1000 / 1000);
+
+	return 0;
+}
+
 static ssize_t mme_clk_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
......
...@@ -40,8 +40,6 @@ ...@@ -40,8 +40,6 @@
#define HL_MAX_QUEUES 128 #define HL_MAX_QUEUES 128
#define HL_MAX_JOBS_PER_CS 64
/* MUST BE POWER OF 2 and larger than 1 */ /* MUST BE POWER OF 2 and larger than 1 */
#define HL_MAX_PENDING_CS 64 #define HL_MAX_PENDING_CS 64
...@@ -85,12 +83,15 @@ struct hl_fpriv; ...@@ -85,12 +83,15 @@ struct hl_fpriv;
* @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
* memories and/or operates the compute engines. * memories and/or operates the compute engines.
* @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU. * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
* @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion
* notifications are sent by H/W.
*/ */
enum hl_queue_type { enum hl_queue_type {
QUEUE_TYPE_NA, QUEUE_TYPE_NA,
QUEUE_TYPE_EXT, QUEUE_TYPE_EXT,
QUEUE_TYPE_INT, QUEUE_TYPE_INT,
QUEUE_TYPE_CPU QUEUE_TYPE_CPU,
QUEUE_TYPE_HW
}; };
/** /**
...@@ -98,10 +99,13 @@ enum hl_queue_type { ...@@ -98,10 +99,13 @@ enum hl_queue_type {
* @type: queue type. * @type: queue type.
* @driver_only: true if only the driver is allowed to send a job to this queue, * @driver_only: true if only the driver is allowed to send a job to this queue,
* false otherwise. * false otherwise.
* @requires_kernel_cb: true if a CB handle must be provided for jobs on this
* queue, false otherwise (a CB address must be provided).
*/ */
struct hw_queue_properties { struct hw_queue_properties {
enum hl_queue_type type; enum hl_queue_type type;
u8 driver_only; u8 driver_only;
u8 requires_kernel_cb;
}; };
/** /**
...@@ -110,8 +114,8 @@ struct hw_queue_properties { ...@@ -110,8 +114,8 @@ struct hw_queue_properties {
* @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address. * @VM_TYPE_PHYS_PACK: mapping of DRAM memory to device virtual address.
*/ */
enum vm_type_t { enum vm_type_t {
VM_TYPE_USERPTR, VM_TYPE_USERPTR = 0x1,
VM_TYPE_PHYS_PACK VM_TYPE_PHYS_PACK = 0x2
}; };
/** /**
...@@ -126,6 +130,36 @@ enum hl_device_hw_state { ...@@ -126,6 +130,36 @@ enum hl_device_hw_state {
HL_DEVICE_HW_STATE_DIRTY HL_DEVICE_HW_STATE_DIRTY
}; };
/**
* struct hl_mmu_properties - ASIC specific MMU address translation properties.
* @hop0_shift: shift of hop 0 mask.
* @hop1_shift: shift of hop 1 mask.
* @hop2_shift: shift of hop 2 mask.
* @hop3_shift: shift of hop 3 mask.
* @hop4_shift: shift of hop 4 mask.
* @hop0_mask: mask to get the PTE address in hop 0.
* @hop1_mask: mask to get the PTE address in hop 1.
* @hop2_mask: mask to get the PTE address in hop 2.
* @hop3_mask: mask to get the PTE address in hop 3.
* @hop4_mask: mask to get the PTE address in hop 4.
* @page_size: default page size used to allocate memory.
* @huge_page_size: page size used to allocate memory with huge pages.
*/
struct hl_mmu_properties {
u64 hop0_shift;
u64 hop1_shift;
u64 hop2_shift;
u64 hop3_shift;
u64 hop4_shift;
u64 hop0_mask;
u64 hop1_mask;
u64 hop2_mask;
u64 hop3_mask;
u64 hop4_mask;
u32 page_size;
u32 huge_page_size;
};
/** /**
* struct asic_fixed_properties - ASIC specific immutable properties. * struct asic_fixed_properties - ASIC specific immutable properties.
* @hw_queues_props: H/W queues properties. * @hw_queues_props: H/W queues properties.
...@@ -133,6 +167,8 @@ enum hl_device_hw_state { ...@@ -133,6 +167,8 @@ enum hl_device_hw_state {
* available sensors. * available sensors.
* @uboot_ver: F/W U-boot version. * @uboot_ver: F/W U-boot version.
* @preboot_ver: F/W Preboot version. * @preboot_ver: F/W Preboot version.
* @dmmu: DRAM MMU address translation properties.
* @pmmu: PCI (host) MMU address translation properties.
* @sram_base_address: SRAM physical start address. * @sram_base_address: SRAM physical start address.
* @sram_end_address: SRAM physical end address. * @sram_end_address: SRAM physical end address.
* @sram_user_base_address - SRAM physical start address for user access. * @sram_user_base_address - SRAM physical start address for user access.
...@@ -169,17 +205,19 @@ enum hl_device_hw_state { ...@@ -169,17 +205,19 @@ enum hl_device_hw_state {
* @psoc_pci_pll_nf: PCI PLL NF value. * @psoc_pci_pll_nf: PCI PLL NF value.
* @psoc_pci_pll_od: PCI PLL OD value. * @psoc_pci_pll_od: PCI PLL OD value.
* @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value. * @psoc_pci_pll_div_factor: PCI PLL DIV FACTOR 1 value.
* @completion_queues_count: number of completion queues.
* @high_pll: high PLL frequency used by the device. * @high_pll: high PLL frequency used by the device.
* @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_cnt: number of CBs in the CB pool.
* @cb_pool_cb_size: size of each CB in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool.
* @tpc_enabled_mask: which TPCs are enabled. * @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
*/ */
struct asic_fixed_properties { struct asic_fixed_properties {
struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES]; struct hw_queue_properties hw_queues_props[HL_MAX_QUEUES];
struct armcp_info armcp_info; struct armcp_info armcp_info;
char uboot_ver[VERSION_MAX_LEN]; char uboot_ver[VERSION_MAX_LEN];
char preboot_ver[VERSION_MAX_LEN]; char preboot_ver[VERSION_MAX_LEN];
struct hl_mmu_properties dmmu;
struct hl_mmu_properties pmmu;
u64 sram_base_address; u64 sram_base_address;
u64 sram_end_address; u64 sram_end_address;
u64 sram_user_base_address; u64 sram_user_base_address;
...@@ -214,8 +252,8 @@ struct asic_fixed_properties { ...@@ -214,8 +252,8 @@ struct asic_fixed_properties {
u32 high_pll; u32 high_pll;
u32 cb_pool_cb_cnt; u32 cb_pool_cb_cnt;
u32 cb_pool_cb_size; u32 cb_pool_cb_size;
u8 completion_queues_count;
u8 tpc_enabled_mask; u8 tpc_enabled_mask;
u8 completion_queues_count;
}; };
/** /**
...@@ -236,8 +274,6 @@ struct hl_dma_fence { ...@@ -236,8 +274,6 @@ struct hl_dma_fence {
* Command Buffers * Command Buffers
*/ */
#define HL_MAX_CB_SIZE 0x200000 /* 2MB */
/** /**
* struct hl_cb_mgr - describes a Command Buffer Manager. * struct hl_cb_mgr - describes a Command Buffer Manager.
* @cb_lock: protects cb_handles. * @cb_lock: protects cb_handles.
...@@ -481,8 +517,8 @@ enum hl_pll_frequency { ...@@ -481,8 +517,8 @@ enum hl_pll_frequency {
* @get_events_stat: retrieve event queue entries histogram. * @get_events_stat: retrieve event queue entries histogram.
* @read_pte: read MMU page table entry from DRAM. * @read_pte: read MMU page table entry from DRAM.
* @write_pte: write MMU page table entry to DRAM. * @write_pte: write MMU page table entry to DRAM.
* @mmu_invalidate_cache: flush MMU STLB cache, either with soft (L1 only) or * @mmu_invalidate_cache: flush MMU STLB host/DRAM cache, either with soft
* hard (L0 & L1) flush. * (L1 only) or hard (L0 & L1) flush.
* @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
* ASID-VA-size mask. * ASID-VA-size mask.
* @send_heartbeat: send is-alive packet to ArmCP and verify response. * @send_heartbeat: send is-alive packet to ArmCP and verify response.
...@@ -502,6 +538,7 @@ enum hl_pll_frequency { ...@@ -502,6 +538,7 @@ enum hl_pll_frequency {
* @rreg: Read a register. Needed for simulator support. * @rreg: Read a register. Needed for simulator support.
* @wreg: Write a register. Needed for simulator support. * @wreg: Write a register. Needed for simulator support.
* @halt_coresight: stop the ETF and ETR traces. * @halt_coresight: stop the ETF and ETR traces.
* @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
*/ */
struct hl_asic_funcs { struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev); int (*early_init)(struct hl_device *hdev);
...@@ -562,7 +599,8 @@ struct hl_asic_funcs { ...@@ -562,7 +599,8 @@ struct hl_asic_funcs {
u32 *size); u32 *size);
u64 (*read_pte)(struct hl_device *hdev, u64 addr); u64 (*read_pte)(struct hl_device *hdev, u64 addr);
void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val); void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard); void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
u32 flags);
void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard, void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
u32 asid, u64 va, u64 size); u32 asid, u64 va, u64 size);
int (*send_heartbeat)(struct hl_device *hdev); int (*send_heartbeat)(struct hl_device *hdev);
...@@ -584,6 +622,7 @@ struct hl_asic_funcs { ...@@ -584,6 +622,7 @@ struct hl_asic_funcs {
u32 (*rreg)(struct hl_device *hdev, u32 reg); u32 (*rreg)(struct hl_device *hdev, u32 reg);
void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
void (*halt_coresight)(struct hl_device *hdev); void (*halt_coresight)(struct hl_device *hdev);
int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
}; };
...@@ -688,7 +727,7 @@ struct hl_ctx_mgr { ...@@ -688,7 +727,7 @@ struct hl_ctx_mgr {
* @sgt: pointer to the scatter-gather table that holds the pages. * @sgt: pointer to the scatter-gather table that holds the pages.
* @dir: for DMA unmapping, the direction must be supplied, so save it. * @dir: for DMA unmapping, the direction must be supplied, so save it.
* @debugfs_list: node in debugfs list of command submissions. * @debugfs_list: node in debugfs list of command submissions.
* @addr: user-space virtual pointer to the start of the memory area. * @addr: user-space virtual address of the start of the memory area.
* @size: size of the memory area to pin & map. * @size: size of the memory area to pin & map.
* @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise. * @dma_mapped: true if the SG was mapped to DMA addresses, false otherwise.
*/ */
...@@ -752,11 +791,14 @@ struct hl_cs { ...@@ -752,11 +791,14 @@ struct hl_cs {
* @userptr_list: linked-list of userptr mappings that belong to this job and * @userptr_list: linked-list of userptr mappings that belong to this job and
* wait for completion. * wait for completion.
* @debugfs_list: node in debugfs list of command submission jobs. * @debugfs_list: node in debugfs list of command submission jobs.
* @queue_type: the type of the H/W queue this job is submitted to.
* @id: the id of this job inside a CS. * @id: the id of this job inside a CS.
* @hw_queue_id: the id of the H/W queue this job is submitted to. * @hw_queue_id: the id of the H/W queue this job is submitted to.
* @user_cb_size: the actual size of the CB we got from the user. * @user_cb_size: the actual size of the CB we got from the user.
* @job_cb_size: the actual size of the CB that we put on the queue. * @job_cb_size: the actual size of the CB that we put on the queue.
* @ext_queue: whether the job is for external queue or internal queue. * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
* handle to a kernel-allocated CB object, false
* otherwise (SRAM/DRAM/host address).
*/ */
struct hl_cs_job { struct hl_cs_job {
struct list_head cs_node; struct list_head cs_node;
...@@ -766,39 +808,44 @@ struct hl_cs_job { ...@@ -766,39 +808,44 @@ struct hl_cs_job {
struct work_struct finish_work; struct work_struct finish_work;
struct list_head userptr_list; struct list_head userptr_list;
struct list_head debugfs_list; struct list_head debugfs_list;
enum hl_queue_type queue_type;
u32 id; u32 id;
u32 hw_queue_id; u32 hw_queue_id;
u32 user_cb_size; u32 user_cb_size;
u32 job_cb_size; u32 job_cb_size;
u8 ext_queue; u8 is_kernel_allocated_cb;
}; };
/** /**
* struct hl_cs_parser - command submission paerser properties. * struct hl_cs_parser - command submission parser properties.
* @user_cb: the CB we got from the user. * @user_cb: the CB we got from the user.
* @patched_cb: in case of patching, this is internal CB which is submitted on * @patched_cb: in case of patching, this is internal CB which is submitted on
* the queue instead of the CB we got from the IOCTL. * the queue instead of the CB we got from the IOCTL.
* @job_userptr_list: linked-list of userptr mappings that belong to the related * @job_userptr_list: linked-list of userptr mappings that belong to the related
* job and wait for completion. * job and wait for completion.
* @cs_sequence: the sequence number of the related CS. * @cs_sequence: the sequence number of the related CS.
* @queue_type: the type of the H/W queue this job is submitted to.
* @ctx_id: the ID of the context the related CS belongs to. * @ctx_id: the ID of the context the related CS belongs to.
* @hw_queue_id: the id of the H/W queue this job is submitted to. * @hw_queue_id: the id of the H/W queue this job is submitted to.
* @user_cb_size: the actual size of the CB we got from the user. * @user_cb_size: the actual size of the CB we got from the user.
* @patched_cb_size: the size of the CB after parsing. * @patched_cb_size: the size of the CB after parsing.
* @ext_queue: whether the job is for external queue or internal queue.
* @job_id: the id of the related job inside the related CS. * @job_id: the id of the related job inside the related CS.
* @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
* handle to a kernel-allocated CB object, false
* otherwise (SRAM/DRAM/host address).
*/ */
struct hl_cs_parser { struct hl_cs_parser {
struct hl_cb *user_cb; struct hl_cb *user_cb;
struct hl_cb *patched_cb; struct hl_cb *patched_cb;
struct list_head *job_userptr_list; struct list_head *job_userptr_list;
u64 cs_sequence; u64 cs_sequence;
enum hl_queue_type queue_type;
u32 ctx_id; u32 ctx_id;
u32 hw_queue_id; u32 hw_queue_id;
u32 user_cb_size; u32 user_cb_size;
u32 patched_cb_size; u32 patched_cb_size;
u8 ext_queue;
u8 job_id; u8 job_id;
u8 is_kernel_allocated_cb;
}; };
...@@ -1048,8 +1095,9 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); ...@@ -1048,8 +1095,9 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
#define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT #define REG_FIELD_SHIFT(reg, field) reg##_##field##_SHIFT
#define REG_FIELD_MASK(reg, field) reg##_##field##_MASK #define REG_FIELD_MASK(reg, field) reg##_##field##_MASK
#define WREG32_FIELD(reg, field, val) \ #define WREG32_FIELD(reg, offset, field, val) \
WREG32(mm##reg, (RREG32(mm##reg) & ~REG_FIELD_MASK(reg, field)) | \ WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & \
~REG_FIELD_MASK(reg, field)) | \
(val) << REG_FIELD_SHIFT(reg, field)) (val) << REG_FIELD_SHIFT(reg, field))
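A hedged usage sketch of the reworked macro: the register and field names are token-pasted into mm<reg> and <reg>_<field>_MASK/SHIFT, and the new offset argument lets a single read-modify-write serve a whole block of identical engines. The DMA_QM_0_GLBL_CFG1 register and its DMA_STOP field appear in the Goya register headers touched later in this series; the per-engine stride and helper name below are placeholders, not real driver constants.

/* Stop DMA engine 'dma_id' by setting the DMA_STOP field of its
 * GLBL_CFG1 register. HYP_DMA_QM_STRIDE is a hypothetical per-engine
 * stride used only for illustration.
 */
#define HYP_DMA_QM_STRIDE	0x20000

static void hyp_stop_dma_engine(struct hl_device *hdev, int dma_id)
{
	WREG32_FIELD(DMA_QM_0_GLBL_CFG1, dma_id * HYP_DMA_QM_STRIDE,
			DMA_STOP, 1);
}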
/* Timeout should be longer when working with simulator but cap the /* Timeout should be longer when working with simulator but cap the
...@@ -1501,7 +1549,8 @@ int hl_cb_pool_init(struct hl_device *hdev); ...@@ -1501,7 +1549,8 @@ int hl_cb_pool_init(struct hl_device *hdev);
int hl_cb_pool_fini(struct hl_device *hdev); int hl_cb_pool_fini(struct hl_device *hdev);
void hl_cs_rollback_all(struct hl_device *hdev); void hl_cs_rollback_all(struct hl_device *hdev);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue); struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void goya_set_asic_funcs(struct hl_device *hdev); void goya_set_asic_funcs(struct hl_device *hdev);
...@@ -1513,7 +1562,7 @@ void hl_vm_fini(struct hl_device *hdev); ...@@ -1513,7 +1562,7 @@ void hl_vm_fini(struct hl_device *hdev);
int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
struct hl_userptr *userptr); struct hl_userptr *userptr);
int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr); void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
void hl_userptr_delete_list(struct hl_device *hdev, void hl_userptr_delete_list(struct hl_device *hdev,
struct list_head *userptr_list); struct list_head *userptr_list);
bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size, bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr, u32 size,
......
...@@ -60,11 +60,16 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) ...@@ -60,11 +60,16 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask; hw_ip.tpc_enabled_mask = prop->tpc_enabled_mask;
hw_ip.sram_size = prop->sram_size - sram_kmd_size; hw_ip.sram_size = prop->sram_size - sram_kmd_size;
hw_ip.dram_size = prop->dram_size - dram_kmd_size; hw_ip.dram_size = prop->dram_size - dram_kmd_size;
if (hw_ip.dram_size > 0) if (hw_ip.dram_size > PAGE_SIZE)
hw_ip.dram_enabled = 1; hw_ip.dram_enabled = 1;
hw_ip.num_of_events = prop->num_of_events; hw_ip.num_of_events = prop->num_of_events;
memcpy(hw_ip.armcp_version,
prop->armcp_info.armcp_version, VERSION_MAX_LEN); memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
memcpy(hw_ip.card_name, prop->armcp_info.card_name,
min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version); hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr; hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf; hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
...@@ -179,16 +184,13 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args) ...@@ -179,16 +184,13 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
goto out; goto out;
} }
if (output) { if (output && copy_to_user((void __user *) (uintptr_t) args->output_ptr,
if (copy_to_user((void __user *) (uintptr_t) args->output_ptr, output, args->output_size)) {
output, dev_err(hdev->dev, "copy to user failed in debug ioctl\n");
args->output_size)) {
dev_err(hdev->dev,
"copy to user failed in debug ioctl\n");
rc = -EFAULT; rc = -EFAULT;
goto out; goto out;
} }
}
out: out:
kfree(params); kfree(params);
...@@ -221,6 +223,41 @@ static int device_utilization(struct hl_device *hdev, struct hl_info_args *args) ...@@ -221,6 +223,41 @@ static int device_utilization(struct hl_device *hdev, struct hl_info_args *args)
min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0; min((size_t) max_size, sizeof(device_util))) ? -EFAULT : 0;
} }
static int get_clk_rate(struct hl_device *hdev, struct hl_info_args *args)
{
struct hl_info_clk_rate clk_rate = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
rc = hdev->asic_funcs->get_clk_rate(hdev, &clk_rate.cur_clk_rate_mhz,
&clk_rate.max_clk_rate_mhz);
if (rc)
return rc;
return copy_to_user(out, &clk_rate,
min((size_t) max_size, sizeof(clk_rate))) ? -EFAULT : 0;
}
static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args)
{
struct hl_info_reset_count reset_count = {0};
u32 max_size = args->return_size;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
reset_count.hard_reset_cnt = hdev->hard_reset_cnt;
reset_count.soft_reset_cnt = hdev->soft_reset_cnt;
return copy_to_user(out, &reset_count,
min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0;
}
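Both new handlers follow the existing INFO IOCTL convention: user space passes an opcode, a return buffer and its size, and the kernel copies back at most min(return_size, sizeof(result)). A minimal user-space sketch, assuming the uapi structures exported by this series; the include path and device node path are assumptions, and error handling is trimmed for brevity.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <misc/habanalabs.h>	/* exported uapi header, install path may vary */

static void query_clk_and_resets(const char *dev_path)
{
	struct hl_info_clk_rate clk = {0};
	struct hl_info_reset_count resets = {0};
	struct hl_info_args args;
	int fd = open(dev_path, O_RDWR);

	if (fd < 0)
		return;

	/* HL_INFO_CLK_RATE: current and maximum clock rate in MHz */
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_CLK_RATE;
	args.return_pointer = (uint64_t) (uintptr_t) &clk;
	args.return_size = sizeof(clk);
	if (!ioctl(fd, HL_IOCTL_INFO, &args))
		printf("clk: %u/%u MHz\n", clk.cur_clk_rate_mhz,
				clk.max_clk_rate_mhz);

	/* HL_INFO_RESET_COUNT: hard and soft reset counters */
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_RESET_COUNT;
	args.return_pointer = (uint64_t) (uintptr_t) &resets;
	args.return_size = sizeof(resets);
	if (!ioctl(fd, HL_IOCTL_INFO, &args))
		printf("resets: hard %u soft %u\n", resets.hard_reset_cnt,
				resets.soft_reset_cnt);

	close(fd);
}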
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev) struct device *dev)
{ {
...@@ -239,6 +276,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, ...@@ -239,6 +276,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_DEVICE_STATUS: case HL_INFO_DEVICE_STATUS:
return device_status_info(hdev, args); return device_status_info(hdev, args);
case HL_INFO_RESET_COUNT:
return get_reset_count(hdev, args);
default: default:
break; break;
} }
...@@ -271,6 +311,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, ...@@ -271,6 +311,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
rc = hw_events_info(hdev, true, args); rc = hw_events_info(hdev, true, args);
break; break;
case HL_INFO_CLK_RATE:
rc = get_clk_rate(hdev, args);
break;
default: default:
dev_err(dev, "Invalid request %d\n", args->op); dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY; rc = -ENOTTY;
...@@ -406,8 +450,7 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg, ...@@ -406,8 +450,7 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
retcode = func(hpriv, kdata); retcode = func(hpriv, kdata);
if (cmd & IOC_OUT) if ((cmd & IOC_OUT) && copy_to_user((void __user *)arg, kdata, usize))
if (copy_to_user((void __user *)arg, kdata, usize))
retcode = -EFAULT; retcode = -EFAULT;
out_err: out_err:
......
...@@ -58,8 +58,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) ...@@ -58,8 +58,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
} }
/* /*
* ext_queue_submit_bd - Submit a buffer descriptor to an external queue * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
* * H/W queue.
* @hdev: pointer to habanalabs device structure * @hdev: pointer to habanalabs device structure
* @q: pointer to habanalabs queue structure * @q: pointer to habanalabs queue structure
* @ctl: BD's control word * @ctl: BD's control word
...@@ -73,8 +73,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) ...@@ -73,8 +73,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
* This function must be called when the scheduler mutex is taken * This function must be called when the scheduler mutex is taken
* *
*/ */
static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
u32 ctl, u32 len, u64 ptr) struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
{ {
struct hl_bd *bd; struct hl_bd *bd;
...@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev, ...@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
return 0; return 0;
} }
/*
* hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
* @hdev: Pointer to hl_device structure.
* @q: Pointer to hl_hw_queue structure.
* @num_of_entries: How many entries to check for space.
*
* Perform the following:
* - Make sure we have enough space in the completion queue.
* This check also ensures that there is enough space in the h/w queue, as
* both queues are of the same size.
* - Reserve space in the completion queue (needs to be reversed if there
* is a failure down the road before the actual submission of work).
*
* Both operations are done using the "free_slots_cnt" field of the completion
* queue. The CI counters of the queue and the completion queue are not
* needed/used for the H/W queue type.
*/
static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
int num_of_entries)
{
atomic_t *free_slots =
&hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
/*
* Check that we have enough space in the completion queue.
* Subtract num_of_entries from the free-slots counter; if the result
* goes negative, the CQ is full and we can't submit a new CB, so
* restore the counter and fail. atomic_add_negative() returns true
* when the post-add value is negative.
*/
if (atomic_add_negative(num_of_entries * -1, free_slots)) {
dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
num_of_entries, q->hw_queue_id);
atomic_add(num_of_entries, free_slots);
return -EAGAIN;
}
return 0;
}
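Because the reservation trick above is easy to misread, here is a hedged stand-alone analogue of the same pattern written with C11 atomics; the kernel's atomic_add_negative() reports whether the post-add value went negative, which is exactly what the fetch-and-subtract below checks.

#include <stdatomic.h>
#include <stdbool.h>

/* Try to reserve 'n' completion-queue slots; roll the counter back and
 * fail if fewer than 'n' slots were free. Mirrors the logic of
 * hw_queue_sanity_checks() outside the kernel, for illustration only.
 */
static bool reserve_cq_slots(atomic_int *free_slots, int n)
{
	if (atomic_fetch_sub(free_slots, n) - n < 0) {
		atomic_fetch_add(free_slots, n);	/* undo the reservation */
		return false;
	}
	return true;
}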
/* /*
* hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
* *
...@@ -188,7 +227,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, ...@@ -188,7 +227,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
u32 cb_size, u64 cb_ptr) u32 cb_size, u64 cb_ptr)
{ {
struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
int rc; int rc = 0;
/* /*
* The CPU queue is a synchronous queue with an effective depth of * The CPU queue is a synchronous queue with an effective depth of
...@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, ...@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
goto out; goto out;
} }
/*
* hl_hw_queue_send_cb_no_cmpl() is called for H/W-type queues only
* during the init phase, when the queues are empty and being tested,
* so there is no need for sanity checks.
*/
if (q->queue_type != QUEUE_TYPE_HW) {
rc = ext_queue_sanity_checks(hdev, q, 1, false); rc = ext_queue_sanity_checks(hdev, q, 1, false);
if (rc) if (rc)
goto out; goto out;
}
ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr); ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
out: out:
if (q->queue_type != QUEUE_TYPE_CPU) if (q->queue_type != QUEUE_TYPE_CPU)
...@@ -220,14 +266,14 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, ...@@ -220,14 +266,14 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
} }
/* /*
* ext_hw_queue_schedule_job - submit an JOB to an external queue * ext_queue_schedule_job - submit a JOB to an external queue
* *
* @job: pointer to the job that needs to be submitted to the queue * @job: pointer to the job that needs to be submitted to the queue
* *
* This function must be called when the scheduler mutex is taken * This function must be called when the scheduler mutex is taken
* *
*/ */
static void ext_hw_queue_schedule_job(struct hl_cs_job *job) static void ext_queue_schedule_job(struct hl_cs_job *job)
{ {
struct hl_device *hdev = job->cs->ctx->hdev; struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
...@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job) ...@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
* H/W queues is done under the scheduler mutex * H/W queues is done under the scheduler mutex
* *
* No need to check if CQ is full because it was already * No need to check if CQ is full because it was already
* checked in hl_queue_sanity_checks * checked in ext_queue_sanity_checks
*/ */
cq = &hdev->completion_queue[q->hw_queue_id]; cq = &hdev->completion_queue[q->hw_queue_id];
cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry); cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
...@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job) ...@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
cq->pi = hl_cq_inc_ptr(cq->pi); cq->pi = hl_cq_inc_ptr(cq->pi);
ext_queue_submit_bd(hdev, q, ctl, len, ptr); ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
} }
/* /*
* int_hw_queue_schedule_job - submit an JOB to an internal queue * int_queue_schedule_job - submit a JOB to an internal queue
* *
* @job: pointer to the job that needs to be submitted to the queue * @job: pointer to the job that needs to be submitted to the queue
* *
* This function must be called when the scheduler mutex is taken * This function must be called when the scheduler mutex is taken
* *
*/ */
static void int_hw_queue_schedule_job(struct hl_cs_job *job) static void int_queue_schedule_job(struct hl_cs_job *job)
{ {
struct hl_device *hdev = job->cs->ctx->hdev; struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
...@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job) ...@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
} }
/*
* hw_queue_schedule_job - submit a JOB to a H/W queue
*
* @job: pointer to the job that needs to be submitted to the queue
*
* This function must be called when the scheduler mutex is taken
*
*/
static void hw_queue_schedule_job(struct hl_cs_job *job)
{
struct hl_device *hdev = job->cs->ctx->hdev;
struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
struct hl_cq *cq;
u64 ptr;
u32 offset, ctl, len;
/*
* Upon PQE completion, COMP_DATA is used as the write data to the
* completion queue (QMAN HBW message), and COMP_OFFSET is used as the
* write address offset in the SM block (QMAN LBW message).
* The write address offset is calculated as "COMP_OFFSET << 2".
*/
offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
len = job->job_cb_size;
/*
* A patched CB is created only if a user CB was allocated by the driver
* and the MMU is disabled. If the MMU is enabled, the user CB should be
* used instead. If the user CB wasn't allocated by the driver, assume
* that it holds an address.
*/
if (job->patched_cb)
ptr = job->patched_cb->bus_address;
else if (job->is_kernel_allocated_cb)
ptr = job->user_cb->bus_address;
else
ptr = (u64) (uintptr_t) job->user_cb;
/*
* No need to protect pi_offset because scheduling to the
* H/W queues is done under the scheduler mutex
*
* No need to check if CQ is full because it was already
* checked in hw_queue_sanity_checks
*/
cq = &hdev->completion_queue[q->hw_queue_id];
cq->pi = hl_cq_inc_ptr(cq->pi);
ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
}
/* /*
* hl_hw_queue_schedule_cs - schedule a command submission * hl_hw_queue_schedule_cs - schedule a command submission
* *
...@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) ...@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
} }
q = &hdev->kernel_queues[0]; q = &hdev->kernel_queues[0];
/* This loop assumes all external queues are consecutive */
for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) { for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
if (q->queue_type == QUEUE_TYPE_EXT) {
if (cs->jobs_in_queue_cnt[i]) { if (cs->jobs_in_queue_cnt[i]) {
switch (q->queue_type) {
case QUEUE_TYPE_EXT:
rc = ext_queue_sanity_checks(hdev, q, rc = ext_queue_sanity_checks(hdev, q,
cs->jobs_in_queue_cnt[i], true); cs->jobs_in_queue_cnt[i], true);
if (rc) break;
goto unroll_cq_resv; case QUEUE_TYPE_INT:
cq_cnt++;
}
} else if (q->queue_type == QUEUE_TYPE_INT) {
if (cs->jobs_in_queue_cnt[i]) {
rc = int_queue_sanity_checks(hdev, q, rc = int_queue_sanity_checks(hdev, q,
cs->jobs_in_queue_cnt[i]); cs->jobs_in_queue_cnt[i]);
break;
case QUEUE_TYPE_HW:
rc = hw_queue_sanity_checks(hdev, q,
cs->jobs_in_queue_cnt[i]);
break;
default:
dev_err(hdev->dev, "Queue type %d is invalid\n",
q->queue_type);
rc = -EINVAL;
break;
}
if (rc) if (rc)
goto unroll_cq_resv; goto unroll_cq_resv;
}
if (q->queue_type == QUEUE_TYPE_EXT ||
q->queue_type == QUEUE_TYPE_HW)
cq_cnt++;
} }
} }
...@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) ...@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
} }
list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node) list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
if (job->ext_queue) switch (job->queue_type) {
ext_hw_queue_schedule_job(job); case QUEUE_TYPE_EXT:
else ext_queue_schedule_job(job);
int_hw_queue_schedule_job(job); break;
case QUEUE_TYPE_INT:
int_queue_schedule_job(job);
break;
case QUEUE_TYPE_HW:
hw_queue_schedule_job(job);
break;
default:
break;
}
cs->submitted = true; cs->submitted = true;
goto out; goto out;
unroll_cq_resv: unroll_cq_resv:
/* This loop assumes all external queues are consecutive */
q = &hdev->kernel_queues[0]; q = &hdev->kernel_queues[0];
for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) { for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
if ((q->queue_type == QUEUE_TYPE_EXT) && if ((q->queue_type == QUEUE_TYPE_EXT ||
(cs->jobs_in_queue_cnt[i])) { q->queue_type == QUEUE_TYPE_HW) &&
cs->jobs_in_queue_cnt[i]) {
atomic_t *free_slots = atomic_t *free_slots =
&hdev->completion_queue[i].free_slots_cnt; &hdev->completion_queue[i].free_slots_cnt;
atomic_add(cs->jobs_in_queue_cnt[i], free_slots); atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
...@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) ...@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
q->ci = hl_queue_inc_ptr(q->ci); q->ci = hl_queue_inc_ptr(q->ci);
} }
static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
struct hl_hw_queue *q, bool is_cpu_queue) bool is_cpu_queue)
{ {
void *p; void *p;
int rc; int rc;
...@@ -465,7 +585,7 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev, ...@@ -465,7 +585,7 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
return rc; return rc;
} }
static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{ {
void *p; void *p;
...@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) ...@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
return 0; return 0;
} }
static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{
return ext_and_cpu_queue_init(hdev, q, true);
}
static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{ {
return ext_and_cpu_hw_queue_init(hdev, q, true); return ext_and_cpu_queue_init(hdev, q, false);
} }
static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
{ {
return ext_and_cpu_hw_queue_init(hdev, q, false); void *p;
p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
HL_QUEUE_SIZE_IN_BYTES,
&q->bus_address,
GFP_KERNEL | __GFP_ZERO);
if (!p)
return -ENOMEM;
q->kernel_address = (u64) (uintptr_t) p;
/* Make sure read/write pointers are initialized to start of queue */
q->ci = 0;
q->pi = 0;
return 0;
} }
/* /*
* hw_queue_init - main initialization function for H/W queue object * queue_init - main initialization function for H/W queue object
* *
* @hdev: pointer to hl_device device structure * @hdev: pointer to hl_device device structure
* @q: pointer to hl_hw_queue queue structure * @q: pointer to hl_hw_queue queue structure
...@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) ...@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
* Allocate dma-able memory for the queue and initialize fields * Allocate dma-able memory for the queue and initialize fields
* Returns 0 on success * Returns 0 on success
*/ */
static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
u32 hw_queue_id) u32 hw_queue_id)
{ {
int rc; int rc;
...@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, ...@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
switch (q->queue_type) { switch (q->queue_type) {
case QUEUE_TYPE_EXT: case QUEUE_TYPE_EXT:
rc = ext_hw_queue_init(hdev, q); rc = ext_queue_init(hdev, q);
break; break;
case QUEUE_TYPE_INT: case QUEUE_TYPE_INT:
rc = int_hw_queue_init(hdev, q); rc = int_queue_init(hdev, q);
break; break;
case QUEUE_TYPE_CPU: case QUEUE_TYPE_CPU:
rc = cpu_hw_queue_init(hdev, q); rc = cpu_queue_init(hdev, q);
break;
case QUEUE_TYPE_HW:
rc = hw_queue_init(hdev, q);
break; break;
case QUEUE_TYPE_NA: case QUEUE_TYPE_NA:
q->valid = 0; q->valid = 0;
return 0; return 0;
default: default:
dev_crit(hdev->dev, "wrong queue type %d during init\n", dev_crit(hdev->dev, "wrong queue type %d during init\n",
q->queue_type); q->queue_type);
...@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, ...@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
* *
* Free the queue memory * Free the queue memory
*/ */
static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q) static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
{ {
if (!q->valid) if (!q->valid)
return; return;
...@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev) ...@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) { i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
q->queue_type = asic->hw_queues_props[i].type; q->queue_type = asic->hw_queues_props[i].type;
rc = hw_queue_init(hdev, q, i); rc = queue_init(hdev, q, i);
if (rc) { if (rc) {
dev_err(hdev->dev, dev_err(hdev->dev,
"failed to initialize queue %d\n", i); "failed to initialize queue %d\n", i);
...@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev) ...@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
release_queues: release_queues:
for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++) for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
hw_queue_fini(hdev, q); queue_fini(hdev, q);
kfree(hdev->kernel_queues); kfree(hdev->kernel_queues);
...@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev) ...@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
int i; int i;
for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++) for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
hw_queue_fini(hdev, q); queue_fini(hdev, q);
kfree(hdev->kernel_queues); kfree(hdev->kernel_queues);
} }
......
...@@ -260,4 +260,6 @@ ...@@ -260,4 +260,6 @@
#define DMA_QM_3_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT #define DMA_QM_3_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
#define DMA_QM_4_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT #define DMA_QM_4_GLBL_CFG1_DMA_STOP_SHIFT DMA_QM_0_GLBL_CFG1_DMA_STOP_SHIFT
#define PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT 1
#endif /* ASIC_REG_GOYA_MASKS_H_ */ #endif /* ASIC_REG_GOYA_MASKS_H_ */
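The new shift belongs to the "set ETR as non-secured" patch in this series; a hedged sketch of how it is presumably consumed by the coresight code (RREG32/WREG32 are the driver's MMIO accessors, mmPSOC_ETR_AXICTL comes from the new psoc_etr_regs.h below, and the function name is hypothetical):

static void hyp_etr_set_non_secure(struct hl_device *hdev)
{
	u32 axictl = RREG32(mmPSOC_ETR_AXICTL);

	/* PROT control bit 1 marks the ETR AXI transactions as non-secure */
	axictl |= 0x1 << PSOC_ETR_AXICTL_PROTCTRLBIT1_SHIFT;
	WREG32(mmPSOC_ETR_AXICTL, axictl);
}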
...@@ -84,6 +84,7 @@ ...@@ -84,6 +84,7 @@
#include "tpc6_rtr_regs.h" #include "tpc6_rtr_regs.h"
#include "tpc7_nrtr_regs.h" #include "tpc7_nrtr_regs.h"
#include "tpc0_eml_cfg_regs.h" #include "tpc0_eml_cfg_regs.h"
#include "psoc_etr_regs.h"
#include "psoc_global_conf_masks.h" #include "psoc_global_conf_masks.h"
#include "dma_macro_masks.h" #include "dma_macro_masks.h"
......
/* SPDX-License-Identifier: GPL-2.0
*
* Copyright 2016-2018 HabanaLabs, Ltd.
* All Rights Reserved.
*
*/
/************************************
** This is an auto-generated file **
** DO NOT EDIT BELOW **
************************************/
#ifndef ASIC_REG_PSOC_ETR_REGS_H_
#define ASIC_REG_PSOC_ETR_REGS_H_
/*
*****************************************
* PSOC_ETR (Prototype: ETR)
*****************************************
*/
#define mmPSOC_ETR_RSZ 0x2C43004
#define mmPSOC_ETR_STS 0x2C4300C
#define mmPSOC_ETR_RRD 0x2C43010
#define mmPSOC_ETR_RRP 0x2C43014
#define mmPSOC_ETR_RWP 0x2C43018
#define mmPSOC_ETR_TRG 0x2C4301C
#define mmPSOC_ETR_CTL 0x2C43020
#define mmPSOC_ETR_RWD 0x2C43024
#define mmPSOC_ETR_MODE 0x2C43028
#define mmPSOC_ETR_LBUFLEVEL 0x2C4302C
#define mmPSOC_ETR_CBUFLEVEL 0x2C43030
#define mmPSOC_ETR_BUFWM 0x2C43034
#define mmPSOC_ETR_RRPHI 0x2C43038
#define mmPSOC_ETR_RWPHI 0x2C4303C
#define mmPSOC_ETR_AXICTL 0x2C43110
#define mmPSOC_ETR_DBALO 0x2C43118
#define mmPSOC_ETR_DBAHI 0x2C4311C
#define mmPSOC_ETR_FFSR 0x2C43300
#define mmPSOC_ETR_FFCR 0x2C43304
#define mmPSOC_ETR_PSCR 0x2C43308
#define mmPSOC_ETR_ITMISCOP0 0x2C43EE0
#define mmPSOC_ETR_ITTRFLIN 0x2C43EE8
#define mmPSOC_ETR_ITATBDATA0 0x2C43EEC
#define mmPSOC_ETR_ITATBCTR2 0x2C43EF0
#define mmPSOC_ETR_ITATBCTR1 0x2C43EF4
#define mmPSOC_ETR_ITATBCTR0 0x2C43EF8
#define mmPSOC_ETR_ITCTRL 0x2C43F00
#define mmPSOC_ETR_CLAIMSET 0x2C43FA0
#define mmPSOC_ETR_CLAIMCLR 0x2C43FA4
#define mmPSOC_ETR_LAR 0x2C43FB0
#define mmPSOC_ETR_LSR 0x2C43FB4
#define mmPSOC_ETR_AUTHSTATUS 0x2C43FB8
#define mmPSOC_ETR_DEVID 0x2C43FC8
#define mmPSOC_ETR_DEVTYPE 0x2C43FCC
#define mmPSOC_ETR_PERIPHID4 0x2C43FD0
#define mmPSOC_ETR_PERIPHID5 0x2C43FD4
#define mmPSOC_ETR_PERIPHID6 0x2C43FD8
#define mmPSOC_ETR_PERIPHID7 0x2C43FDC
#define mmPSOC_ETR_PERIPHID0 0x2C43FE0
#define mmPSOC_ETR_PERIPHID1 0x2C43FE4
#define mmPSOC_ETR_PERIPHID2 0x2C43FE8
#define mmPSOC_ETR_PERIPHID3 0x2C43FEC
#define mmPSOC_ETR_COMPID0 0x2C43FF0
#define mmPSOC_ETR_COMPID1 0x2C43FF4
#define mmPSOC_ETR_COMPID2 0x2C43FF8
#define mmPSOC_ETR_COMPID3 0x2C43FFC
#endif /* ASIC_REG_PSOC_ETR_REGS_H_ */
...@@ -20,6 +20,8 @@ enum cpu_boot_status { ...@@ -20,6 +20,8 @@ enum cpu_boot_status {
CPU_BOOT_STATUS_DRAM_INIT_FAIL, CPU_BOOT_STATUS_DRAM_INIT_FAIL,
CPU_BOOT_STATUS_FIT_CORRUPTED, CPU_BOOT_STATUS_FIT_CORRUPTED,
CPU_BOOT_STATUS_UBOOT_NOT_READY, CPU_BOOT_STATUS_UBOOT_NOT_READY,
CPU_BOOT_STATUS_RESERVED,
CPU_BOOT_STATUS_TS_INIT_FAIL,
}; };
enum kmd_msg { enum kmd_msg {
......
...@@ -12,18 +12,16 @@ ...@@ -12,18 +12,16 @@
#define PAGE_SHIFT_2MB 21 #define PAGE_SHIFT_2MB 21
#define PAGE_SIZE_2MB (_AC(1, UL) << PAGE_SHIFT_2MB) #define PAGE_SIZE_2MB (_AC(1, UL) << PAGE_SHIFT_2MB)
#define PAGE_SIZE_4KB (_AC(1, UL) << PAGE_SHIFT_4KB) #define PAGE_SIZE_4KB (_AC(1, UL) << PAGE_SHIFT_4KB)
#define PAGE_MASK_2MB (~(PAGE_SIZE_2MB - 1))
#define PAGE_PRESENT_MASK 0x0000000000001ull #define PAGE_PRESENT_MASK 0x0000000000001ull
#define SWAP_OUT_MASK 0x0000000000004ull #define SWAP_OUT_MASK 0x0000000000004ull
#define LAST_MASK 0x0000000000800ull #define LAST_MASK 0x0000000000800ull
#define PHYS_ADDR_MASK 0xFFFFFFFFFFFFF000ull
#define HOP0_MASK 0x3000000000000ull #define HOP0_MASK 0x3000000000000ull
#define HOP1_MASK 0x0FF8000000000ull #define HOP1_MASK 0x0FF8000000000ull
#define HOP2_MASK 0x0007FC0000000ull #define HOP2_MASK 0x0007FC0000000ull
#define HOP3_MASK 0x000003FE00000ull #define HOP3_MASK 0x000003FE00000ull
#define HOP4_MASK 0x00000001FF000ull #define HOP4_MASK 0x00000001FF000ull
#define OFFSET_MASK 0x0000000000FFFull #define FLAGS_MASK 0x0000000000FFFull
#define HOP0_SHIFT 48 #define HOP0_SHIFT 48
#define HOP1_SHIFT 39 #define HOP1_SHIFT 39
...@@ -31,8 +29,7 @@ ...@@ -31,8 +29,7 @@
#define HOP3_SHIFT 21 #define HOP3_SHIFT 21
#define HOP4_SHIFT 12 #define HOP4_SHIFT 12
#define PTE_PHYS_ADDR_SHIFT 12 #define HOP_PHYS_ADDR_MASK (~FLAGS_MASK)
#define PTE_PHYS_ADDR_MASK ~OFFSET_MASK
#define HL_PTE_SIZE sizeof(u64) #define HL_PTE_SIZE sizeof(u64)
#define HOP_TABLE_SIZE PAGE_SIZE_4KB #define HOP_TABLE_SIZE PAGE_SIZE_4KB
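A hedged sketch of how the renamed masks fit together: the low 12 bits of a PTE are flag bits (FLAGS_MASK) and everything above them is the physical address of the next hop table, so HOP_PHYS_ADDR_MASK is simply the complement. The helpers below are illustration only, not driver code.

/* Split a 64-bit PTE into the next-hop physical address and check the
 * flag bits defined above.
 */
static inline u64 pte_to_hop_phys_addr(u64 pte)
{
	return pte & HOP_PHYS_ADDR_MASK;
}

static inline bool pte_is_mapped_last(u64 pte)
{
	return (pte & PAGE_PRESENT_MASK) && (pte & LAST_MASK);
}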
......
...@@ -23,6 +23,8 @@ struct hl_bd { ...@@ -23,6 +23,8 @@ struct hl_bd {
#define HL_BD_SIZE sizeof(struct hl_bd) #define HL_BD_SIZE sizeof(struct hl_bd)
/* /*
* S/W CTL FIELDS.
*
* BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
* valid. 1 means the repeat field is valid, 0 means not-valid, * valid. 1 means the repeat field is valid, 0 means not-valid,
* i.e. repeat == 1 * i.e. repeat == 1
...@@ -33,6 +35,16 @@ struct hl_bd { ...@@ -33,6 +35,16 @@ struct hl_bd {
#define BD_CTL_SHADOW_INDEX_SHIFT 0 #define BD_CTL_SHADOW_INDEX_SHIFT 0
#define BD_CTL_SHADOW_INDEX_MASK 0x00000FFF #define BD_CTL_SHADOW_INDEX_MASK 0x00000FFF
/*
* H/W CTL FIELDS
*/
#define BD_CTL_COMP_OFFSET_SHIFT 16
#define BD_CTL_COMP_OFFSET_MASK 0x00FF0000
#define BD_CTL_COMP_DATA_SHIFT 0
#define BD_CTL_COMP_DATA_MASK 0x0000FFFF
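For reference, hw_queue_schedule_job() earlier in this patchset packs these two fields into a single BD control word; a hedged helper that spells the packing out (per the comment in that function, the hardware writes comp_data to the completion queue and uses comp_offset << 2 as the SM register offset). The helper name is hypothetical.

/* Pack the H/W completion fields into a BD control word, matching the
 * masks defined above; illustration only.
 */
static inline u32 hl_bd_hw_ctl(u32 comp_offset, u32 comp_data)
{
	return ((comp_offset << BD_CTL_COMP_OFFSET_SHIFT) &
				BD_CTL_COMP_OFFSET_MASK) |
		((comp_data << BD_CTL_COMP_DATA_SHIFT) &
				BD_CTL_COMP_DATA_MASK);
}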
/* /*
* COMPLETION QUEUE * COMPLETION QUEUE
*/ */
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/genalloc.h> #include <linux/genalloc.h>
#define PGS_IN_2MB_PAGE (PAGE_SIZE_2MB >> PAGE_SHIFT)
#define HL_MMU_DEBUG 0 #define HL_MMU_DEBUG 0
/* /*
...@@ -159,20 +158,19 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -159,20 +158,19 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
} }
/* /*
* get_userptr_from_host_va - initialize userptr structure from given host * dma_map_host_va - DMA mapping of the given host virtual address.
* virtual address * @hdev: habanalabs device structure
* * @addr: the host virtual address of the memory area
* @hdev : habanalabs device structure * @size: the size of the memory area
* @args : parameters containing the virtual address and size * @p_userptr: pointer to result userptr structure
* @p_userptr : pointer to result userptr structure
* *
* This function does the following: * This function does the following:
* - Allocate userptr structure * - Allocate userptr structure
* - Pin the given host memory using the userptr structure * - Pin the given host memory using the userptr structure
* - Perform DMA mapping to have the DMA addresses of the pages * - Perform DMA mapping to have the DMA addresses of the pages
*/ */
static int get_userptr_from_host_va(struct hl_device *hdev, static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
struct hl_mem_in *args, struct hl_userptr **p_userptr) struct hl_userptr **p_userptr)
{ {
struct hl_userptr *userptr; struct hl_userptr *userptr;
int rc; int rc;
...@@ -183,8 +181,7 @@ static int get_userptr_from_host_va(struct hl_device *hdev, ...@@ -183,8 +181,7 @@ static int get_userptr_from_host_va(struct hl_device *hdev,
goto userptr_err; goto userptr_err;
} }
rc = hl_pin_host_memory(hdev, args->map_host.host_virt_addr, rc = hl_pin_host_memory(hdev, addr, size, userptr);
args->map_host.mem_size, userptr);
if (rc) { if (rc) {
dev_err(hdev->dev, "Failed to pin host memory\n"); dev_err(hdev->dev, "Failed to pin host memory\n");
goto pin_err; goto pin_err;
...@@ -215,16 +212,16 @@ static int get_userptr_from_host_va(struct hl_device *hdev, ...@@ -215,16 +212,16 @@ static int get_userptr_from_host_va(struct hl_device *hdev,
} }
/* /*
* free_userptr - free userptr structure * dma_unmap_host_va - DMA unmapping of the given host virtual address.
* * @hdev: habanalabs device structure
* @hdev : habanalabs device structure * @userptr: userptr to free
* @userptr : userptr to free
* *
* This function does the following: * This function does the following:
* - Unpins the physical pages * - Unpins the physical pages
* - Frees the userptr structure * - Frees the userptr structure
*/ */
static void free_userptr(struct hl_device *hdev, struct hl_userptr *userptr) static void dma_unmap_host_va(struct hl_device *hdev,
struct hl_userptr *userptr)
{ {
hl_unpin_host_memory(hdev, userptr); hl_unpin_host_memory(hdev, userptr);
kfree(userptr); kfree(userptr);
...@@ -254,9 +251,8 @@ static void dram_pg_pool_do_release(struct kref *ref) ...@@ -254,9 +251,8 @@ static void dram_pg_pool_do_release(struct kref *ref)
/* /*
* free_phys_pg_pack - free physical page pack * free_phys_pg_pack - free physical page pack
* * @hdev: habanalabs device structure
* @hdev : habanalabs device structure * @phys_pg_pack: physical page pack to free
* @phys_pg_pack : physical page pack to free
* *
* This function does the following: * This function does the following:
* - For DRAM memory only, iterate over the pack and free each physical block * - For DRAM memory only, iterate over the pack and free each physical block
...@@ -528,18 +524,17 @@ static u64 get_va_block(struct hl_device *hdev, ...@@ -528,18 +524,17 @@ static u64 get_va_block(struct hl_device *hdev,
u32 page_size; u32 page_size;
bool add_prev = false; bool add_prev = false;
if (is_userptr) { if (is_userptr)
/* /*
* We cannot know if the user allocated memory with huge pages * We cannot know if the user allocated memory with huge pages
* or not, hence we continue with the biggest possible * or not, hence we continue with the biggest possible
* granularity. * granularity.
*/ */
page_size = PAGE_SIZE_2MB; page_size = hdev->asic_prop.pmmu.huge_page_size;
page_mask = PAGE_MASK_2MB; else
} else { page_size = hdev->asic_prop.dmmu.page_size;
page_size = hdev->asic_prop.dram_page_size;
page_mask = ~((u64)page_size - 1); page_mask = ~((u64)page_size - 1);
}
mutex_lock(&va_range->lock); mutex_lock(&va_range->lock);
...@@ -549,7 +544,6 @@ static u64 get_va_block(struct hl_device *hdev, ...@@ -549,7 +544,6 @@ static u64 get_va_block(struct hl_device *hdev,
/* calc the first possible aligned addr */ /* calc the first possible aligned addr */
valid_start = va_block->start; valid_start = va_block->start;
if (valid_start & (page_size - 1)) { if (valid_start & (page_size - 1)) {
valid_start &= page_mask; valid_start &= page_mask;
valid_start += page_size; valid_start += page_size;
...@@ -561,7 +555,6 @@ static u64 get_va_block(struct hl_device *hdev, ...@@ -561,7 +555,6 @@ static u64 get_va_block(struct hl_device *hdev,
if (valid_size >= size && if (valid_size >= size &&
(!new_va_block || valid_size < res_valid_size)) { (!new_va_block || valid_size < res_valid_size)) {
new_va_block = va_block; new_va_block = va_block;
res_valid_start = valid_start; res_valid_start = valid_start;
res_valid_size = valid_size; res_valid_size = valid_size;
...@@ -632,10 +625,9 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr) ...@@ -632,10 +625,9 @@ static u32 get_sg_info(struct scatterlist *sg, dma_addr_t *dma_addr)
/* /*
* init_phys_pg_pack_from_userptr - initialize physical page pack from host * init_phys_pg_pack_from_userptr - initialize physical page pack from host
* memory * memory
* * @ctx: current context
* @ctx : current context * @userptr: userptr to initialize from
* @userptr : userptr to initialize from * @pphys_pg_pack: result pointer
* @pphys_pg_pack : res pointer
* *
* This function does the following: * This function does the following:
* - Pin the physical pages related to the given virtual block * - Pin the physical pages related to the given virtual block
...@@ -646,13 +638,16 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, ...@@ -646,13 +638,16 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
struct hl_userptr *userptr, struct hl_userptr *userptr,
struct hl_vm_phys_pg_pack **pphys_pg_pack) struct hl_vm_phys_pg_pack **pphys_pg_pack)
{ {
struct hl_mmu_properties *mmu_prop = &ctx->hdev->asic_prop.pmmu;
struct hl_vm_phys_pg_pack *phys_pg_pack; struct hl_vm_phys_pg_pack *phys_pg_pack;
struct scatterlist *sg; struct scatterlist *sg;
dma_addr_t dma_addr; dma_addr_t dma_addr;
u64 page_mask, total_npages; u64 page_mask, total_npages;
u32 npages, page_size = PAGE_SIZE; u32 npages, page_size = PAGE_SIZE,
huge_page_size = mmu_prop->huge_page_size;
bool first = true, is_huge_page_opt = true; bool first = true, is_huge_page_opt = true;
int rc, i, j; int rc, i, j;
u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL); phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
if (!phys_pg_pack) if (!phys_pg_pack)
...@@ -675,14 +670,14 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, ...@@ -675,14 +670,14 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
total_npages += npages; total_npages += npages;
if ((npages % PGS_IN_2MB_PAGE) || if ((npages % pgs_in_huge_page) ||
(dma_addr & (PAGE_SIZE_2MB - 1))) (dma_addr & (huge_page_size - 1)))
is_huge_page_opt = false; is_huge_page_opt = false;
} }
if (is_huge_page_opt) { if (is_huge_page_opt) {
page_size = PAGE_SIZE_2MB; page_size = huge_page_size;
total_npages /= PGS_IN_2MB_PAGE; do_div(total_npages, pgs_in_huge_page);
} }
page_mask = ~(((u64) page_size) - 1); page_mask = ~(((u64) page_size) - 1);
...@@ -714,7 +709,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, ...@@ -714,7 +709,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
dma_addr += page_size; dma_addr += page_size;
if (is_huge_page_opt) if (is_huge_page_opt)
npages -= PGS_IN_2MB_PAGE; npages -= pgs_in_huge_page;
else else
npages--; npages--;
} }
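A quick, hedged sanity check of the factor computed above: __ffs() returns the index of the lowest set bit, so for the usual 4 KiB host page size and a 2 MiB MMU huge page the computed pgs_in_huge_page equals the 512 that the removed PGS_IN_2MB_PAGE constant hard-coded.

#include <assert.h>

int main(void)
{
	unsigned long page_size = 4096UL;		/* host PAGE_SIZE */
	unsigned long huge_page_size = 2UL << 20;	/* 2 MiB huge page */
	/* __builtin_ctzl() plays the role of the kernel's __ffs() here */
	unsigned long pgs_in_huge_page =
			huge_page_size >> __builtin_ctzl(page_size);

	assert(pgs_in_huge_page == 512);
	return 0;
}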
...@@ -731,18 +726,17 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx, ...@@ -731,18 +726,17 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
} }
/* /*
* map_phys_page_pack - maps the physical page pack * map_phys_pg_pack - maps the physical page pack.
* * @ctx: current context
* @ctx : current context * @vaddr: start address of the virtual area to map from
* @vaddr : start address of the virtual area to map from * @phys_pg_pack: the pack of physical pages to map to
* @phys_pg_pack : the pack of physical pages to map to
* *
* This function does the following: * This function does the following:
* - Maps each chunk of virtual memory to matching physical chunk * - Maps each chunk of virtual memory to matching physical chunk
* - Stores number of successful mappings in the given argument * - Stores number of successful mappings in the given argument
* - Returns 0 on success, error code otherwise. * - Returns 0 on success, error code otherwise
*/ */
static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr, static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
struct hl_vm_phys_pg_pack *phys_pg_pack) struct hl_vm_phys_pg_pack *phys_pg_pack)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
...@@ -783,6 +777,36 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr, ...@@ -783,6 +777,36 @@ static int map_phys_page_pack(struct hl_ctx *ctx, u64 vaddr,
return rc; return rc;
} }
/*
* unmap_phys_pg_pack - unmaps the physical page pack
* @ctx: current context
* @vaddr: start address of the virtual area to unmap
* @phys_pg_pack: the pack of physical pages to unmap
*/
static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
struct hl_vm_phys_pg_pack *phys_pg_pack)
{
struct hl_device *hdev = ctx->hdev;
u64 next_vaddr, i;
u32 page_size;
page_size = phys_pg_pack->page_size;
next_vaddr = vaddr;
for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
if (hl_mmu_unmap(ctx, next_vaddr, page_size))
dev_warn_ratelimited(hdev->dev,
"unmap failed for vaddr: 0x%llx\n", next_vaddr);
/*
* Unmapping on Palladium can take a really long time, so avoid a CPU
* soft lockup by sleeping a little between unmapping pages
*/
if (hdev->pldm)
usleep_range(500, 1000);
}
}
static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args, static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
u64 *paddr) u64 *paddr)
{ {
...@@ -839,7 +863,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -839,7 +863,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
*device_addr = 0; *device_addr = 0;
if (is_userptr) { if (is_userptr) {
rc = get_userptr_from_host_va(hdev, args, &userptr); u64 addr = args->map_host.host_virt_addr,
size = args->map_host.mem_size;
rc = dma_map_host_va(hdev, addr, size, &userptr);
if (rc) { if (rc) {
dev_err(hdev->dev, "failed to get userptr from va\n"); dev_err(hdev->dev, "failed to get userptr from va\n");
return rc; return rc;
...@@ -850,7 +877,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -850,7 +877,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
if (rc) { if (rc) {
dev_err(hdev->dev, dev_err(hdev->dev,
"unable to init page pack for vaddr 0x%llx\n", "unable to init page pack for vaddr 0x%llx\n",
args->map_host.host_virt_addr); addr);
goto init_page_pack_err; goto init_page_pack_err;
} }
...@@ -909,7 +936,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -909,7 +936,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
mutex_lock(&ctx->mmu_lock); mutex_lock(&ctx->mmu_lock);
rc = map_phys_page_pack(ctx, ret_vaddr, phys_pg_pack); rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
if (rc) { if (rc) {
mutex_unlock(&ctx->mmu_lock); mutex_unlock(&ctx->mmu_lock);
dev_err(hdev->dev, "mapping page pack failed for handle %u\n", dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
...@@ -917,7 +944,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -917,7 +944,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
goto map_err; goto map_err;
} }
hdev->asic_funcs->mmu_invalidate_cache(hdev, false); hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
mutex_unlock(&ctx->mmu_lock); mutex_unlock(&ctx->mmu_lock);
...@@ -955,7 +982,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -955,7 +982,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
free_phys_pg_pack(hdev, phys_pg_pack); free_phys_pg_pack(hdev, phys_pg_pack);
init_page_pack_err: init_page_pack_err:
if (is_userptr) if (is_userptr)
free_userptr(hdev, userptr); dma_unmap_host_va(hdev, userptr);
return rc; return rc;
} }
...@@ -965,20 +992,20 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, ...@@ -965,20 +992,20 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
* *
* @ctx : current context * @ctx : current context
* @vaddr : device virtual address to unmap * @vaddr : device virtual address to unmap
* @ctx_free : true if in context free flow, false otherwise.
* *
* This function does the following: * This function does the following:
* - Unmap the physical pages related to the given virtual address * - Unmap the physical pages related to the given virtual address
* - return the device virtual block to the virtual block list * - return the device virtual block to the virtual block list
*/ */
static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr) static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
struct hl_vm_hash_node *hnode = NULL; struct hl_vm_hash_node *hnode = NULL;
struct hl_userptr *userptr = NULL; struct hl_userptr *userptr = NULL;
struct hl_va_range *va_range;
enum vm_type_t *vm_type; enum vm_type_t *vm_type;
u64 next_vaddr, i;
u32 page_size;
bool is_userptr; bool is_userptr;
int rc; int rc;
...@@ -1003,6 +1030,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr) ...@@ -1003,6 +1030,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
if (*vm_type == VM_TYPE_USERPTR) { if (*vm_type == VM_TYPE_USERPTR) {
is_userptr = true; is_userptr = true;
va_range = &ctx->host_va_range;
userptr = hnode->ptr; userptr = hnode->ptr;
rc = init_phys_pg_pack_from_userptr(ctx, userptr, rc = init_phys_pg_pack_from_userptr(ctx, userptr,
&phys_pg_pack); &phys_pg_pack);
...@@ -1014,6 +1042,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr) ...@@ -1014,6 +1042,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
} }
} else if (*vm_type == VM_TYPE_PHYS_PACK) { } else if (*vm_type == VM_TYPE_PHYS_PACK) {
is_userptr = false; is_userptr = false;
va_range = &ctx->dram_va_range;
phys_pg_pack = hnode->ptr; phys_pg_pack = hnode->ptr;
} else { } else {
dev_warn(hdev->dev, dev_warn(hdev->dev,
...@@ -1029,42 +1058,41 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr) ...@@ -1029,42 +1058,41 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr)
goto mapping_cnt_err; goto mapping_cnt_err;
} }
page_size = phys_pg_pack->page_size; vaddr &= ~(((u64) phys_pg_pack->page_size) - 1);
vaddr &= ~(((u64) page_size) - 1);
next_vaddr = vaddr;
mutex_lock(&ctx->mmu_lock); mutex_lock(&ctx->mmu_lock);
for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) { unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
if (hl_mmu_unmap(ctx, next_vaddr, page_size))
dev_warn_ratelimited(hdev->dev,
"unmap failed for vaddr: 0x%llx\n", next_vaddr);
/* unmapping on Palladium can be really long, so avoid a CPU /*
* soft lockup bug by sleeping a little between unmapping pages * During context free this function is called in a loop to clean all
* the context mappings. Hence the cache invalidation can be called once
* at the loop end rather than for each iteration
*/ */
if (hdev->pldm) if (!ctx_free)
usleep_range(500, 1000); hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
}
hdev->asic_funcs->mmu_invalidate_cache(hdev, true);
mutex_unlock(&ctx->mmu_lock); mutex_unlock(&ctx->mmu_lock);
if (add_va_block(hdev, /*
is_userptr ? &ctx->host_va_range : &ctx->dram_va_range, * No point in maintaining the free VA block list if the context is
vaddr, * closing as the list will be freed anyway
vaddr + phys_pg_pack->total_size - 1)) */
dev_warn(hdev->dev, "add va block failed for vaddr: 0x%llx\n", if (!ctx_free) {
rc = add_va_block(hdev, va_range, vaddr,
vaddr + phys_pg_pack->total_size - 1);
if (rc)
dev_warn(hdev->dev,
"add va block failed for vaddr: 0x%llx\n",
vaddr); vaddr);
}
atomic_dec(&phys_pg_pack->mapping_cnt); atomic_dec(&phys_pg_pack->mapping_cnt);
kfree(hnode); kfree(hnode);
if (is_userptr) { if (is_userptr) {
free_phys_pg_pack(hdev, phys_pg_pack); free_phys_pg_pack(hdev, phys_pg_pack);
free_userptr(hdev, userptr); dma_unmap_host_va(hdev, userptr);
} }
return 0; return 0;
...@@ -1189,8 +1217,8 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) ...@@ -1189,8 +1217,8 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
break; break;
case HL_MEM_OP_UNMAP: case HL_MEM_OP_UNMAP:
rc = unmap_device_va(ctx, rc = unmap_device_va(ctx, args->in.unmap.device_virt_addr,
args->in.unmap.device_virt_addr); false);
break; break;
default: default:
...@@ -1203,17 +1231,69 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data) ...@@ -1203,17 +1231,69 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
return rc; return rc;
} }
static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
u32 npages, u64 start, u32 offset,
struct hl_userptr *userptr)
{
int rc;
if (!access_ok((void __user *) (uintptr_t) addr, size)) {
dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
return -EFAULT;
}
userptr->vec = frame_vector_create(npages);
if (!userptr->vec) {
dev_err(hdev->dev, "Failed to create frame vector\n");
return -ENOMEM;
}
rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
userptr->vec);
if (rc != npages) {
dev_err(hdev->dev,
"Failed to map host memory, user ptr probably wrong\n");
if (rc < 0)
goto destroy_framevec;
rc = -EFAULT;
goto put_framevec;
}
if (frame_vector_to_pages(userptr->vec) < 0) {
dev_err(hdev->dev,
"Failed to translate frame vector to pages\n");
rc = -EFAULT;
goto put_framevec;
}
rc = sg_alloc_table_from_pages(userptr->sgt,
frame_vector_pages(userptr->vec),
npages, offset, size, GFP_ATOMIC);
if (rc < 0) {
dev_err(hdev->dev, "failed to create SG table from pages\n");
goto put_framevec;
}
return 0;
put_framevec:
put_vaddr_frames(userptr->vec);
destroy_framevec:
frame_vector_destroy(userptr->vec);
return rc;
}
/* /*
* hl_pin_host_memory - pins a chunk of host memory * hl_pin_host_memory - pins a chunk of host memory.
* * @hdev: pointer to the habanalabs device structure
* @hdev : pointer to the habanalabs device structure * @addr: the host virtual address of the memory area
* @addr : the user-space virtual address of the memory area * @size: the size of the memory area
* @size : the size of the memory area * @userptr: pointer to hl_userptr structure
* @userptr : pointer to hl_userptr structure
* *
* This function does the following: * This function does the following:
* - Pins the physical pages * - Pins the physical pages
* - Create a SG list from those pages * - Create an SG list from those pages
*/ */
int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
struct hl_userptr *userptr) struct hl_userptr *userptr)
...@@ -1227,11 +1307,6 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, ...@@ -1227,11 +1307,6 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
return -EINVAL; return -EINVAL;
} }
if (!access_ok((void __user *) (uintptr_t) addr, size)) {
dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
return -EFAULT;
}
/* /*
* If the combination of the address and size requested for this memory * If the combination of the address and size requested for this memory
* region causes an integer overflow, return error. * region causes an integer overflow, return error.
...@@ -1244,6 +1319,14 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, ...@@ -1244,6 +1319,14 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
return -EINVAL; return -EINVAL;
} }
/*
* This function can also be called from the data path, hence always use
* GFP_ATOMIC as this is not a big allocation.
*/
userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
if (!userptr->sgt)
return -ENOMEM;
start = addr & PAGE_MASK; start = addr & PAGE_MASK;
offset = addr & ~PAGE_MASK; offset = addr & ~PAGE_MASK;
end = PAGE_ALIGN(addr + size); end = PAGE_ALIGN(addr + size);
...@@ -1254,42 +1337,12 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, ...@@ -1254,42 +1337,12 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
userptr->dma_mapped = false; userptr->dma_mapped = false;
INIT_LIST_HEAD(&userptr->job_node); INIT_LIST_HEAD(&userptr->job_node);
userptr->vec = frame_vector_create(npages); rc = get_user_memory(hdev, addr, size, npages, start, offset,
if (!userptr->vec) { userptr);
dev_err(hdev->dev, "Failed to create frame vector\n"); if (rc) {
return -ENOMEM;
}
rc = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
userptr->vec);
if (rc != npages) {
dev_err(hdev->dev,
"Failed to map host memory, user ptr probably wrong\n");
if (rc < 0)
goto destroy_framevec;
rc = -EFAULT;
goto put_framevec;
}
if (frame_vector_to_pages(userptr->vec) < 0) {
dev_err(hdev->dev, dev_err(hdev->dev,
"Failed to translate frame vector to pages\n"); "failed to get user memory for address 0x%llx\n",
rc = -EFAULT; addr);
goto put_framevec;
}
userptr->sgt = kzalloc(sizeof(*userptr->sgt), GFP_ATOMIC);
if (!userptr->sgt) {
rc = -ENOMEM;
goto put_framevec;
}
rc = sg_alloc_table_from_pages(userptr->sgt,
frame_vector_pages(userptr->vec),
npages, offset, size, GFP_ATOMIC);
if (rc < 0) {
dev_err(hdev->dev, "failed to create SG table from pages\n");
goto free_sgt; goto free_sgt;
} }
...@@ -1299,32 +1352,26 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size, ...@@ -1299,32 +1352,26 @@ int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
free_sgt: free_sgt:
kfree(userptr->sgt); kfree(userptr->sgt);
put_framevec:
put_vaddr_frames(userptr->vec);
destroy_framevec:
frame_vector_destroy(userptr->vec);
return rc; return rc;
} }
/* /*
* hl_unpin_host_memory - unpins a chunk of host memory * hl_unpin_host_memory - unpins a chunk of host memory.
* * @hdev: pointer to the habanalabs device structure
* @hdev : pointer to the habanalabs device structure * @userptr: pointer to hl_userptr structure
* @userptr : pointer to hl_userptr structure
* *
* This function does the following: * This function does the following:
* - Unpins the physical pages related to the host memory * - Unpins the physical pages related to the host memory
* - Free the SG list * - Free the SG list
*/ */
int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr) void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
{ {
struct page **pages; struct page **pages;
hl_debugfs_remove_userptr(hdev, userptr); hl_debugfs_remove_userptr(hdev, userptr);
if (userptr->dma_mapped) if (userptr->dma_mapped)
hdev->asic_funcs->hl_dma_unmap_sg(hdev, hdev->asic_funcs->hl_dma_unmap_sg(hdev, userptr->sgt->sgl,
userptr->sgt->sgl,
userptr->sgt->nents, userptr->sgt->nents,
userptr->dir); userptr->dir);
...@@ -1342,8 +1389,6 @@ int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr) ...@@ -1342,8 +1389,6 @@ int hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
sg_free_table(userptr->sgt); sg_free_table(userptr->sgt);
kfree(userptr->sgt); kfree(userptr->sgt);
return 0;
} }
/* /*
...@@ -1542,43 +1587,16 @@ int hl_vm_ctx_init(struct hl_ctx *ctx) ...@@ -1542,43 +1587,16 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
* @hdev : pointer to the habanalabs structure * @hdev : pointer to the habanalabs structure
* va_range : pointer to virtual addresses range * va_range : pointer to virtual addresses range
* *
* This function initializes the following: * This function does the following:
* - Checks that the given range contains the whole initial range
* - Frees the virtual addresses block list and its lock * - Frees the virtual addresses block list and its lock
*/ */
static void hl_va_range_fini(struct hl_device *hdev, static void hl_va_range_fini(struct hl_device *hdev,
struct hl_va_range *va_range) struct hl_va_range *va_range)
{ {
struct hl_vm_va_block *va_block;
if (list_empty(&va_range->list)) {
dev_warn(hdev->dev,
"va list should not be empty on cleanup!\n");
goto out;
}
if (!list_is_singular(&va_range->list)) {
dev_warn(hdev->dev,
"va list should not contain multiple blocks on cleanup!\n");
goto free_va_list;
}
va_block = list_first_entry(&va_range->list, typeof(*va_block), node);
if (va_block->start != va_range->start_addr ||
va_block->end != va_range->end_addr) {
dev_warn(hdev->dev,
"wrong va block on cleanup, from 0x%llx to 0x%llx\n",
va_block->start, va_block->end);
goto free_va_list;
}
free_va_list:
mutex_lock(&va_range->lock); mutex_lock(&va_range->lock);
clear_va_list_locked(hdev, &va_range->list); clear_va_list_locked(hdev, &va_range->list);
mutex_unlock(&va_range->lock); mutex_unlock(&va_range->lock);
out:
mutex_destroy(&va_range->lock); mutex_destroy(&va_range->lock);
} }
...@@ -1613,21 +1631,31 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx) ...@@ -1613,21 +1631,31 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
hl_debugfs_remove_ctx_mem_hash(hdev, ctx); hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
if (!hash_empty(ctx->mem_hash)) /*
dev_notice(hdev->dev, "ctx is freed while it has va in use\n"); * Clearly something went wrong on hard reset so no point in printing
* another side effect error
*/
if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
dev_notice(hdev->dev,
"ctx %d is freed while it has va in use\n",
ctx->asid);
hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) { hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n", "hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
hnode->vaddr, ctx->asid); hnode->vaddr, ctx->asid);
unmap_device_va(ctx, hnode->vaddr); unmap_device_va(ctx, hnode->vaddr, true);
} }
/* invalidate the cache once after the unmapping loop */
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_PHYS_PACK);
spin_lock(&vm->idr_lock); spin_lock(&vm->idr_lock);
idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i) idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
if (phys_pg_list->asid == ctx->asid) { if (phys_pg_list->asid == ctx->asid) {
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"page list 0x%p of asid %d is still alive\n", "page list 0x%px of asid %d is still alive\n",
phys_pg_list, ctx->asid); phys_pg_list, ctx->asid);
atomic64_sub(phys_pg_list->total_size, atomic64_sub(phys_pg_list->total_size,
&hdev->dram_used_mem); &hdev->dram_used_mem);
......
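The teardown loop above leans on the "invalidate MMU cache only once" change from this tag: each unmap_device_va() call in the loop skips the per-entry cache flush, and a single invalidation per VM type is issued after the loop. A minimal stand-alone sketch of that pattern, with illustrative names rather than driver code:

#include <stdio.h>

static void unmap_one(unsigned long va)
{
	/* per-entry work only; no cache invalidate here */
	printf("unmap 0x%lx\n", va);
}

static void mmu_invalidate_cache(void)
{
	printf("single MMU cache invalidate for the whole batch\n");
}

int main(void)
{
	unsigned long vas[] = { 0x1000, 0x2000, 0x3000 };
	unsigned int i;

	for (i = 0; i < sizeof(vas) / sizeof(vas[0]); i++)
		unmap_one(vas[i]);

	mmu_invalidate_cache(); /* amortized: one flush instead of three */
	return 0;
}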
...@@ -25,10 +25,9 @@ static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) ...@@ -25,10 +25,9 @@ static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
return pgt_info; return pgt_info;
} }
static void free_hop(struct hl_ctx *ctx, u64 hop_addr) static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr, gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr,
hdev->asic_prop.mmu_hop_table_size); hdev->asic_prop.mmu_hop_table_size);
...@@ -37,6 +36,13 @@ static void free_hop(struct hl_ctx *ctx, u64 hop_addr) ...@@ -37,6 +36,13 @@ static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
kfree(pgt_info); kfree(pgt_info);
} }
static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
{
struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
_free_hop(ctx, pgt_info);
}
static u64 alloc_hop(struct hl_ctx *ctx) static u64 alloc_hop(struct hl_ctx *ctx)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
...@@ -105,8 +111,8 @@ static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) ...@@ -105,8 +111,8 @@ static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
* clear the 12 LSBs and translate the shadow hop to its associated * clear the 12 LSBs and translate the shadow hop to its associated
* physical hop, and add back the original 12 LSBs. * physical hop, and add back the original 12 LSBs.
*/ */
u64 phys_val = get_phys_addr(ctx, val & PTE_PHYS_ADDR_MASK) | u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
(val & OFFSET_MASK); (val & FLAGS_MASK);
ctx->hdev->asic_funcs->write_pte(ctx->hdev, ctx->hdev->asic_funcs->write_pte(ctx->hdev,
get_phys_addr(ctx, shadow_pte_addr), get_phys_addr(ctx, shadow_pte_addr),
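The comment above describes how write_pte() builds the physical PTE value: mask off the low flag bits, translate the shadow hop address to its physical counterpart, then OR the flags back in. A stand-alone sketch of that composition, assuming FLAGS_MASK covers the 12 LSBs and using a made-up shadow-to-physical translation:

#include <stdint.h>
#include <stdio.h>

#define HOP_PHYS_ADDR_MASK	(~0xFFFULL)	/* assumed: everything above the 12 LSBs */
#define FLAGS_MASK		0xFFFULL	/* assumed: the 12 LSB flag bits */

/* stand-in for get_phys_addr(): shadow hop address -> physical hop address */
static uint64_t shadow_to_phys(uint64_t shadow_addr)
{
	return shadow_addr + 0x100000000ULL;	/* hypothetical fixed offset */
}

int main(void)
{
	uint64_t val = 0x12345000ULL | 0x3;	/* shadow hop address | flags */
	uint64_t phys_val = shadow_to_phys(val & HOP_PHYS_ADDR_MASK) |
			    (val & FLAGS_MASK);

	printf("shadow pte 0x%llx -> phys pte 0x%llx\n",
	       (unsigned long long)val, (unsigned long long)phys_val);
	return 0;
}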
...@@ -159,7 +165,7 @@ static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) ...@@ -159,7 +165,7 @@ static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
*/ */
num_of_ptes_left = pgt_info->num_of_ptes; num_of_ptes_left = pgt_info->num_of_ptes;
if (!num_of_ptes_left) if (!num_of_ptes_left)
free_hop(ctx, hop_addr); _free_hop(ctx, pgt_info);
return num_of_ptes_left; return num_of_ptes_left;
} }
...@@ -171,35 +177,50 @@ static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr, ...@@ -171,35 +177,50 @@ static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
((virt_addr & mask) >> shift); ((virt_addr & mask) >> shift);
} }
static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{ {
return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP0_MASK, HOP0_SHIFT); return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
mmu_prop->hop0_shift);
} }
static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{ {
return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP1_MASK, HOP1_SHIFT); return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
mmu_prop->hop1_shift);
} }
static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{ {
return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP2_MASK, HOP2_SHIFT); return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
mmu_prop->hop2_shift);
} }
static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{ {
return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP3_MASK, HOP3_SHIFT); return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
mmu_prop->hop3_shift);
} }
static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx, u64 hop_addr, u64 vaddr) static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
struct hl_mmu_properties *mmu_prop,
u64 hop_addr, u64 vaddr)
{ {
return get_hopN_pte_addr(ctx, hop_addr, vaddr, HOP4_MASK, HOP4_SHIFT); return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
mmu_prop->hop4_shift);
} }
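With the masks and shifts now carried in hl_mmu_properties, every per-hop helper reduces to the indexing formula of get_hopN_pte_addr(): select the VA bits under the hop's mask, shift them down to an entry index, and scale by the PTE size. A stand-alone sketch with assumed (not real) hop-0 parameters:

#include <stdint.h>
#include <stdio.h>

#define PTE_SIZE 8ULL	/* assumed PTE size in bytes */

static uint64_t hop_pte_addr(uint64_t hop_addr, uint64_t vaddr,
			     uint64_t mask, uint64_t shift)
{
	/* entry index = VA bits under the hop's mask, shifted down */
	return hop_addr + PTE_SIZE * ((vaddr & mask) >> shift);
}

int main(void)
{
	/* hypothetical hop-0 parameters: 9 VA bits starting at bit 39 */
	uint64_t hop0_shift = 39;
	uint64_t hop0_mask = 0x1FFULL << hop0_shift;
	uint64_t pte = hop_pte_addr(0x1000, 0x200000000000ULL,
				    hop0_mask, hop0_shift);

	printf("hop0 pte addr: 0x%llx\n", (unsigned long long)pte);
	return 0;
}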
static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte) static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
{ {
if (curr_pte & PAGE_PRESENT_MASK) if (curr_pte & PAGE_PRESENT_MASK)
return curr_pte & PHYS_ADDR_MASK; return curr_pte & HOP_PHYS_ADDR_MASK;
else else
return ULLONG_MAX; return ULLONG_MAX;
} }
...@@ -288,23 +309,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ...@@ -288,23 +309,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
} }
/* need only pte 0 in hops 0 and 1 */ /* need only pte 0 in hops 0 and 1 */
pte_val = (hop1_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_addr, pte_val); write_pte(ctx, hop0_addr, pte_val);
pte_val = (hop2_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_addr, pte_val); write_pte(ctx, hop1_addr, pte_val);
get_pte(ctx, hop1_addr); get_pte(ctx, hop1_addr);
hop2_pte_addr = hop2_addr; hop2_pte_addr = hop2_addr;
for (i = 0 ; i < num_of_hop3 ; i++) { for (i = 0 ; i < num_of_hop3 ; i++) {
pte_val = (ctx->dram_default_hops[i] & PTE_PHYS_ADDR_MASK) | pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK; PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, pte_val); write_pte(ctx, hop2_pte_addr, pte_val);
get_pte(ctx, hop2_addr); get_pte(ctx, hop2_addr);
hop2_pte_addr += HL_PTE_SIZE; hop2_pte_addr += HL_PTE_SIZE;
} }
pte_val = (prop->mmu_dram_default_page_addr & PTE_PHYS_ADDR_MASK) | pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
LAST_MASK | PAGE_PRESENT_MASK; LAST_MASK | PAGE_PRESENT_MASK;
for (i = 0 ; i < num_of_hop3 ; i++) { for (i = 0 ; i < num_of_hop3 ; i++) {
...@@ -400,8 +421,6 @@ int hl_mmu_init(struct hl_device *hdev) ...@@ -400,8 +421,6 @@ int hl_mmu_init(struct hl_device *hdev)
if (!hdev->mmu_enable) if (!hdev->mmu_enable)
return 0; return 0;
/* MMU H/W init was already done in device hw_init() */
hdev->mmu_pgt_pool = hdev->mmu_pgt_pool =
gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
...@@ -427,6 +446,8 @@ int hl_mmu_init(struct hl_device *hdev) ...@@ -427,6 +446,8 @@ int hl_mmu_init(struct hl_device *hdev)
goto err_pool_add; goto err_pool_add;
} }
/* MMU H/W init will be done in device hw_init() */
return 0; return 0;
err_pool_add: err_pool_add:
...@@ -450,10 +471,10 @@ void hl_mmu_fini(struct hl_device *hdev) ...@@ -450,10 +471,10 @@ void hl_mmu_fini(struct hl_device *hdev)
if (!hdev->mmu_enable) if (!hdev->mmu_enable)
return; return;
/* MMU H/W fini was already done in device hw_fini() */
kvfree(hdev->mmu_shadow_hop0); kvfree(hdev->mmu_shadow_hop0);
gen_pool_destroy(hdev->mmu_pgt_pool); gen_pool_destroy(hdev->mmu_pgt_pool);
/* MMU H/W fini will be done in device hw_fini() */
} }
/** /**
...@@ -501,36 +522,36 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx) ...@@ -501,36 +522,36 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
dram_default_mapping_fini(ctx); dram_default_mapping_fini(ctx);
if (!hash_empty(ctx->mmu_shadow_hash)) if (!hash_empty(ctx->mmu_shadow_hash))
dev_err(hdev->dev, "ctx is freed while it has pgts in use\n"); dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
ctx->asid);
hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) { hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
dev_err(hdev->dev, dev_err_ratelimited(hdev->dev,
"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
free_hop(ctx, pgt_info->shadow_addr); _free_hop(ctx, pgt_info);
} }
mutex_destroy(&ctx->mmu_lock); mutex_destroy(&ctx->mmu_lock);
} }
static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop; struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0, u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0, hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0, hop2_addr = 0, hop2_pte_addr = 0,
hop3_addr = 0, hop3_pte_addr = 0, hop3_addr = 0, hop3_pte_addr = 0,
hop4_addr = 0, hop4_pte_addr = 0, hop4_addr = 0, hop4_pte_addr = 0,
curr_pte; curr_pte;
bool is_dram_addr, is_huge, clear_hop3 = true; bool is_huge, clear_hop3 = true;
is_dram_addr = hl_mem_area_inside_range(virt_addr, PAGE_SIZE_2MB, mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
hop0_addr = get_hop0_addr(ctx); hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr); hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
...@@ -539,7 +560,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -539,7 +560,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
if (hop1_addr == ULLONG_MAX) if (hop1_addr == ULLONG_MAX)
goto not_mapped; goto not_mapped;
hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr); hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
...@@ -548,7 +569,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -548,7 +569,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
if (hop2_addr == ULLONG_MAX) if (hop2_addr == ULLONG_MAX)
goto not_mapped; goto not_mapped;
hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr); hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
...@@ -557,7 +578,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -557,7 +578,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
if (hop3_addr == ULLONG_MAX) if (hop3_addr == ULLONG_MAX)
goto not_mapped; goto not_mapped;
hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr); hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
...@@ -575,7 +596,8 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -575,7 +596,8 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
if (hop4_addr == ULLONG_MAX) if (hop4_addr == ULLONG_MAX)
goto not_mapped; goto not_mapped;
hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr); hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
...@@ -584,7 +606,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -584,7 +606,7 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
if (hdev->dram_default_page_mapping && is_dram_addr) { if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr & u64 default_pte = (prop->mmu_dram_default_page_addr &
PTE_PHYS_ADDR_MASK) | LAST_MASK | HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK; PAGE_PRESENT_MASK;
if (curr_pte == default_pte) { if (curr_pte == default_pte) {
dev_err(hdev->dev, dev_err(hdev->dev,
...@@ -667,25 +689,36 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr) ...@@ -667,25 +689,36 @@ static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr)
int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 real_virt_addr; u64 real_virt_addr;
u32 real_page_size, npages; u32 real_page_size, npages;
int i, rc; int i, rc;
bool is_dram_addr;
if (!hdev->mmu_enable) if (!hdev->mmu_enable)
return 0; return 0;
is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
/* /*
* The H/W handles mapping of 4KB/2MB page. Hence if the host page size * The H/W handles mapping of specific page sizes. Hence if the page
* is bigger, we break it to sub-pages and unmap them separately. * size is bigger, we break it to sub-pages and unmap them separately.
*/ */
if ((page_size % PAGE_SIZE_2MB) == 0) { if ((page_size % mmu_prop->huge_page_size) == 0) {
real_page_size = PAGE_SIZE_2MB; real_page_size = mmu_prop->huge_page_size;
} else if ((page_size % PAGE_SIZE_4KB) == 0) { } else if ((page_size % mmu_prop->page_size) == 0) {
real_page_size = PAGE_SIZE_4KB; real_page_size = mmu_prop->page_size;
} else { } else {
dev_err(hdev->dev, dev_err(hdev->dev,
"page size of %u is not 4KB nor 2MB aligned, can't unmap\n", "page size of %u is not %uKB nor %uMB aligned, can't unmap\n",
page_size); page_size,
mmu_prop->page_size >> 10,
mmu_prop->huge_page_size >> 20);
return -EFAULT; return -EFAULT;
} }
...@@ -694,7 +727,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) ...@@ -694,7 +727,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
real_virt_addr = virt_addr; real_virt_addr = virt_addr;
for (i = 0 ; i < npages ; i++) { for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_unmap(ctx, real_virt_addr); rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr);
if (rc) if (rc)
return rc; return rc;
...@@ -705,10 +738,11 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size) ...@@ -705,10 +738,11 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size)
} }
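hl_mmu_unmap() above (and hl_mmu_map() further down) first picks real_page_size from the per-MMU properties and then walks the region in chunks of that size, one _hl_mmu_unmap()/_hl_mmu_map() call per chunk. A simplified user-space model of that split, with assumed page sizes:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t page_size = 4096;			/* assumed pmmu page size */
	uint32_t huge_page_size = 2 * 1024 * 1024;	/* assumed huge page size */
	uint32_t req_size = 8 * 1024 * 1024;		/* caller's page_size argument */
	uint64_t virt_addr = 0x10000000ULL;
	uint32_t real_page_size, npages, i;

	if ((req_size % huge_page_size) == 0)
		real_page_size = huge_page_size;
	else if ((req_size % page_size) == 0)
		real_page_size = page_size;
	else
		return 1;	/* not aligned to a supported page size */

	npages = req_size / real_page_size;
	for (i = 0; i < npages; i++, virt_addr += real_page_size)
		printf("unmap chunk %u at va 0x%llx\n", i,
		       (unsigned long long)virt_addr);

	return 0;
}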
static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
u32 page_size) u32 page_size, bool is_dram_addr)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop; struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 hop0_addr = 0, hop0_pte_addr = 0, u64 hop0_addr = 0, hop0_pte_addr = 0,
hop1_addr = 0, hop1_pte_addr = 0, hop1_addr = 0, hop1_pte_addr = 0,
hop2_addr = 0, hop2_pte_addr = 0, hop2_addr = 0, hop2_pte_addr = 0,
...@@ -716,21 +750,19 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -716,21 +750,19 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
hop4_addr = 0, hop4_pte_addr = 0, hop4_addr = 0, hop4_pte_addr = 0,
curr_pte = 0; curr_pte = 0;
bool hop1_new = false, hop2_new = false, hop3_new = false, bool hop1_new = false, hop2_new = false, hop3_new = false,
hop4_new = false, is_huge, is_dram_addr; hop4_new = false, is_huge;
int rc = -ENOMEM; int rc = -ENOMEM;
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
/* /*
* This mapping function can map a 4KB/2MB page. For 2MB page there are * This mapping function can map a page or a huge page. For huge page
* only 3 hops rather than 4. Currently the DRAM allocation uses 2MB * there are only 3 hops rather than 4. Currently the DRAM allocation
* pages only but user memory could have been allocated with one of the * uses huge pages only but user memory could have been allocated with
* two page sizes. Since this is a common code for all the three cases, * one of the two page sizes. Since this is a common code for all the
* we need this huge page check. * three cases, we need this huge page check.
*/ */
is_huge = page_size == PAGE_SIZE_2MB; is_huge = page_size == mmu_prop->huge_page_size;
is_dram_addr = hl_mem_area_inside_range(virt_addr, page_size,
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
if (is_dram_addr && !is_huge) { if (is_dram_addr && !is_huge) {
dev_err(hdev->dev, "DRAM mapping should use huge pages only\n"); dev_err(hdev->dev, "DRAM mapping should use huge pages only\n");
...@@ -738,28 +770,28 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -738,28 +770,28 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
} }
hop0_addr = get_hop0_addr(ctx); hop0_addr = get_hop0_addr(ctx);
hop0_pte_addr = get_hop0_pte_addr(ctx, hop0_addr, virt_addr); hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new); hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
if (hop1_addr == ULLONG_MAX) if (hop1_addr == ULLONG_MAX)
goto err; goto err;
hop1_pte_addr = get_hop1_pte_addr(ctx, hop1_addr, virt_addr); hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new); hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
if (hop2_addr == ULLONG_MAX) if (hop2_addr == ULLONG_MAX)
goto err; goto err;
hop2_pte_addr = get_hop2_pte_addr(ctx, hop2_addr, virt_addr); hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new); hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
if (hop3_addr == ULLONG_MAX) if (hop3_addr == ULLONG_MAX)
goto err; goto err;
hop3_pte_addr = get_hop3_pte_addr(ctx, hop3_addr, virt_addr); hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
if (!is_huge) { if (!is_huge) {
...@@ -767,13 +799,14 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -767,13 +799,14 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
if (hop4_addr == ULLONG_MAX) if (hop4_addr == ULLONG_MAX)
goto err; goto err;
hop4_pte_addr = get_hop4_pte_addr(ctx, hop4_addr, virt_addr); hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
virt_addr);
curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr; curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
} }
if (hdev->dram_default_page_mapping && is_dram_addr) { if (hdev->dram_default_page_mapping && is_dram_addr) {
u64 default_pte = (prop->mmu_dram_default_page_addr & u64 default_pte = (prop->mmu_dram_default_page_addr &
PTE_PHYS_ADDR_MASK) | LAST_MASK | HOP_PHYS_ADDR_MASK) | LAST_MASK |
PAGE_PRESENT_MASK; PAGE_PRESENT_MASK;
if (curr_pte != default_pte) { if (curr_pte != default_pte) {
...@@ -813,7 +846,7 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -813,7 +846,7 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
goto err; goto err;
} }
curr_pte = (phys_addr & PTE_PHYS_ADDR_MASK) | LAST_MASK curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
| PAGE_PRESENT_MASK; | PAGE_PRESENT_MASK;
if (is_huge) if (is_huge)
...@@ -823,25 +856,25 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -823,25 +856,25 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
if (hop1_new) { if (hop1_new) {
curr_pte = curr_pte =
(hop1_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop0_pte_addr, curr_pte); write_pte(ctx, hop0_pte_addr, curr_pte);
} }
if (hop2_new) { if (hop2_new) {
curr_pte = curr_pte =
(hop2_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop1_pte_addr, curr_pte); write_pte(ctx, hop1_pte_addr, curr_pte);
get_pte(ctx, hop1_addr); get_pte(ctx, hop1_addr);
} }
if (hop3_new) { if (hop3_new) {
curr_pte = curr_pte =
(hop3_addr & PTE_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; (hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
write_pte(ctx, hop2_pte_addr, curr_pte); write_pte(ctx, hop2_pte_addr, curr_pte);
get_pte(ctx, hop2_addr); get_pte(ctx, hop2_addr);
} }
if (!is_huge) { if (!is_huge) {
if (hop4_new) { if (hop4_new) {
curr_pte = (hop4_addr & PTE_PHYS_ADDR_MASK) | curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
PAGE_PRESENT_MASK; PAGE_PRESENT_MASK;
write_pte(ctx, hop3_pte_addr, curr_pte); write_pte(ctx, hop3_pte_addr, curr_pte);
get_pte(ctx, hop3_addr); get_pte(ctx, hop3_addr);
...@@ -890,25 +923,36 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, ...@@ -890,25 +923,36 @@ static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
{ {
struct hl_device *hdev = ctx->hdev; struct hl_device *hdev = ctx->hdev;
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_mmu_properties *mmu_prop;
u64 real_virt_addr, real_phys_addr; u64 real_virt_addr, real_phys_addr;
u32 real_page_size, npages; u32 real_page_size, npages;
int i, rc, mapped_cnt = 0; int i, rc, mapped_cnt = 0;
bool is_dram_addr;
if (!hdev->mmu_enable) if (!hdev->mmu_enable)
return 0; return 0;
is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
prop->va_space_dram_start_address,
prop->va_space_dram_end_address);
mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
/* /*
* The H/W handles mapping of 4KB/2MB page. Hence if the host page size * The H/W handles mapping of specific page sizes. Hence if the page
* is bigger, we break it to sub-pages and map them separately. * size is bigger, we break it to sub-pages and map them separately.
*/ */
if ((page_size % PAGE_SIZE_2MB) == 0) { if ((page_size % mmu_prop->huge_page_size) == 0) {
real_page_size = PAGE_SIZE_2MB; real_page_size = mmu_prop->huge_page_size;
} else if ((page_size % PAGE_SIZE_4KB) == 0) { } else if ((page_size % mmu_prop->page_size) == 0) {
real_page_size = PAGE_SIZE_4KB; real_page_size = mmu_prop->page_size;
} else { } else {
dev_err(hdev->dev, dev_err(hdev->dev,
"page size of %u is not 4KB nor 2MB aligned, can't map\n", "page size of %u is not %dKB nor %dMB aligned, can't unmap\n",
page_size); page_size,
mmu_prop->page_size >> 10,
mmu_prop->huge_page_size >> 20);
return -EFAULT; return -EFAULT;
} }
...@@ -923,7 +967,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) ...@@ -923,7 +967,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
for (i = 0 ; i < npages ; i++) { for (i = 0 ; i < npages ; i++) {
rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr, rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr,
real_page_size); real_page_size, is_dram_addr);
if (rc) if (rc)
goto err; goto err;
...@@ -937,7 +981,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size) ...@@ -937,7 +981,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size)
err: err:
real_virt_addr = virt_addr; real_virt_addr = virt_addr;
for (i = 0 ; i < mapped_cnt ; i++) { for (i = 0 ; i < mapped_cnt ; i++) {
if (_hl_mmu_unmap(ctx, real_virt_addr)) if (_hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr))
dev_warn_ratelimited(hdev->dev, dev_warn_ratelimited(hdev->dev,
"failed to unmap va: 0x%llx\n", real_virt_addr); "failed to unmap va: 0x%llx\n", real_virt_addr);
......
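The recurring pattern in this file after the PCI/DRAM split is the selection of the right hl_mmu_properties: the VA is classified against the DRAM VA range and the matching properties supply the page sizes and hop masks. A stand-alone sketch of that selection, with made-up ranges and sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mmu_props {
	const char *name;
	uint32_t page_size;
	uint32_t huge_page_size;
};

static bool inside_range(uint64_t addr, uint64_t size,
			 uint64_t start, uint64_t end)
{
	return addr >= start && (addr + size) <= end;
}

int main(void)
{
	/* hypothetical DRAM VA window and per-MMU page sizes */
	const uint64_t dram_va_start = 0x20000000000ULL;
	const uint64_t dram_va_end   = 0x40000000000ULL;
	struct mmu_props dmmu = { "dmmu", 2u << 20, 2u << 20 };
	struct mmu_props pmmu = { "pmmu", 4096, 2u << 20 };

	uint64_t virt_addr = 0x21000000000ULL;
	bool is_dram_addr = inside_range(virt_addr, dmmu.page_size,
					 dram_va_start, dram_va_end);
	struct mmu_props *mmu_prop = is_dram_addr ? &dmmu : &pmmu;

	printf("va 0x%llx -> %s (page %u, huge %u)\n",
	       (unsigned long long)virt_addr, mmu_prop->name,
	       mmu_prop->page_size, mmu_prop->huge_page_size);
	return 0;
}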
...@@ -95,6 +95,12 @@ enum hl_device_status { ...@@ -95,6 +95,12 @@ enum hl_device_status {
* percentage of the utilization rate. * percentage of the utilization rate.
* HL_INFO_HW_EVENTS_AGGREGATE - Receive an array describing how many times each * HL_INFO_HW_EVENTS_AGGREGATE - Receive an array describing how many times each
* event occurred since the driver was loaded. * event occurred since the driver was loaded.
* HL_INFO_CLK_RATE - Retrieve the current and maximum clock rate
* of the device in MHz. The maximum clock rate is
 *                    configurable via a sysfs parameter.
* HL_INFO_RESET_COUNT - Retrieve the counts of the soft and hard reset
* operations performed on the device since the last
* time the driver was loaded.
*/ */
#define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1 #define HL_INFO_HW_EVENTS 1
...@@ -103,8 +109,11 @@ enum hl_device_status { ...@@ -103,8 +109,11 @@ enum hl_device_status {
#define HL_INFO_DEVICE_STATUS 4 #define HL_INFO_DEVICE_STATUS 4
#define HL_INFO_DEVICE_UTILIZATION 6 #define HL_INFO_DEVICE_UTILIZATION 6
#define HL_INFO_HW_EVENTS_AGGREGATE 7 #define HL_INFO_HW_EVENTS_AGGREGATE 7
#define HL_INFO_CLK_RATE 8
#define HL_INFO_RESET_COUNT 9
#define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
struct hl_info_hw_ip_info { struct hl_info_hw_ip_info {
__u64 sram_base_address; __u64 sram_base_address;
...@@ -123,6 +132,7 @@ struct hl_info_hw_ip_info { ...@@ -123,6 +132,7 @@ struct hl_info_hw_ip_info {
__u8 dram_enabled; __u8 dram_enabled;
__u8 pad[2]; __u8 pad[2];
__u8 armcp_version[HL_INFO_VERSION_MAX_LEN]; __u8 armcp_version[HL_INFO_VERSION_MAX_LEN];
__u8 card_name[HL_INFO_CARD_NAME_MAX_LEN];
}; };
struct hl_info_dram_usage { struct hl_info_dram_usage {
...@@ -149,6 +159,16 @@ struct hl_info_device_utilization { ...@@ -149,6 +159,16 @@ struct hl_info_device_utilization {
__u32 pad; __u32 pad;
}; };
struct hl_info_clk_rate {
__u32 cur_clk_rate_mhz;
__u32 max_clk_rate_mhz;
};
struct hl_info_reset_count {
__u32 hard_reset_cnt;
__u32 soft_reset_cnt;
};
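For the new opcodes, a user-space query looks roughly like the sketch below. This is a hedged example: the /dev/hl0 node, the uapi include path, and the hl_info_args field names (op, return_pointer, return_size) and HL_IOCTL_INFO are taken from the rest of this header or assumed, so check the installed <misc/habanalabs.h> before relying on it.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <misc/habanalabs.h>	/* assumed uapi install path */

int main(void)
{
	struct hl_info_clk_rate clk;
	struct hl_info_args args;
	int fd = open("/dev/hl0", O_RDWR);	/* assumed device node */

	if (fd < 0)
		return 1;

	memset(&clk, 0, sizeof(clk));
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_CLK_RATE;
	args.return_pointer = (__u64)(uintptr_t)&clk;
	args.return_size = sizeof(clk);

	if (!ioctl(fd, HL_IOCTL_INFO, &args))
		printf("clk: cur %u MHz, max %u MHz\n",
		       clk.cur_clk_rate_mhz, clk.max_clk_rate_mhz);

	close(fd);
	return 0;
}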
struct hl_info_args { struct hl_info_args {
/* Location of relevant struct in userspace */ /* Location of relevant struct in userspace */
__u64 return_pointer; __u64 return_pointer;
...@@ -181,13 +201,15 @@ struct hl_info_args { ...@@ -181,13 +201,15 @@ struct hl_info_args {
/* Opcode to destroy previously created command buffer */ /* Opcode to destroy previously created command buffer */
#define HL_CB_OP_DESTROY 1 #define HL_CB_OP_DESTROY 1
#define HL_MAX_CB_SIZE 0x200000 /* 2MB */
struct hl_cb_in { struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */ /* Handle of CB or 0 if we want to create one */
__u64 cb_handle; __u64 cb_handle;
/* HL_CB_OP_* */ /* HL_CB_OP_* */
__u32 op; __u32 op;
/* Size of CB. Maximum size is 2MB. The minimum size that will be /* Size of CB. Maximum size is HL_MAX_CB_SIZE. The minimum size that
* allocated, regardless of this parameter's value, is PAGE_SIZE * will be allocated, regardless of this parameter's value, is PAGE_SIZE
*/ */
__u32 cb_size; __u32 cb_size;
/* Context ID - Currently not in use */ /* Context ID - Currently not in use */
...@@ -233,6 +255,8 @@ struct hl_cs_chunk { ...@@ -233,6 +255,8 @@ struct hl_cs_chunk {
#define HL_CS_STATUS_SUCCESS 0 #define HL_CS_STATUS_SUCCESS 0
#define HL_MAX_JOBS_PER_CS 512
struct hl_cs_in { struct hl_cs_in {
/* this holds address of array of hl_cs_chunk for restore phase */ /* this holds address of array of hl_cs_chunk for restore phase */
__u64 chunks_restore; __u64 chunks_restore;
...@@ -242,9 +266,13 @@ struct hl_cs_in { ...@@ -242,9 +266,13 @@ struct hl_cs_in {
* Currently not in use * Currently not in use
*/ */
__u64 chunks_store; __u64 chunks_store;
/* Number of chunks in restore phase array */ /* Number of chunks in restore phase array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_restore; __u32 num_chunks_restore;
/* Number of chunks in execution array */ /* Number of chunks in execution array. Maximum number is
* HL_MAX_JOBS_PER_CS
*/
__u32 num_chunks_execute; __u32 num_chunks_execute;
/* Number of chunks in store phase array - Currently not in use */ /* Number of chunks in store phase array - Currently not in use */
__u32 num_chunks_store; __u32 num_chunks_store;
...@@ -589,7 +617,7 @@ struct hl_debug_args { ...@@ -589,7 +617,7 @@ struct hl_debug_args {
* *
* The user can call this IOCTL with a handle it received from the CS IOCTL * The user can call this IOCTL with a handle it received from the CS IOCTL
* to wait until the handle's CS has finished executing. The user will wait * to wait until the handle's CS has finished executing. The user will wait
* inside the kernel until the CS has finished or until the user-requeusted * inside the kernel until the CS has finished or until the user-requested
* timeout has expired. * timeout has expired.
* *
* The return value of the IOCTL is a standard Linux error code. The possible * The return value of the IOCTL is a standard Linux error code. The possible
......