Commit d4e8ad90 authored by Christian König's avatar Christian König Committed by Alex Deucher

drm/amdgpu: reorder CS code

Sort the functions in the order they are called
Signed-off-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 88c98d54
......@@ -278,872 +278,881 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
return ret;
}
/* Convert microseconds to bytes. */
static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
struct amdgpu_cs_parser *parser)
{
if (us <= 0 || !adev->mm_stats.log2_max_MBps)
return 0;
struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
int r, ce_preempt = 0, de_preempt = 0;
struct amdgpu_ring *ring;
int i, j;
/* Since accum_us is incremented by a million per second, just
* multiply it by the number of MB/s to get the number of bytes.
*/
return us << adev->mm_stats.log2_max_MBps;
}
for (i = 0, j = 0; i < parser->nchunks && j < parser->job->num_ibs; i++) {
struct amdgpu_cs_chunk *chunk;
struct amdgpu_ib *ib;
struct drm_amdgpu_cs_chunk_ib *chunk_ib;
struct drm_sched_entity *entity;
static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
{
if (!adev->mm_stats.log2_max_MBps)
return 0;
chunk = &parser->chunks[i];
ib = &parser->job->ibs[j];
chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
return bytes >> adev->mm_stats.log2_max_MBps;
}
if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
continue;
/* Returns how many bytes TTM can move right now. If no bytes can be moved,
* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
* which means it can go over the threshold once. If that happens, the driver
* will be in debt and no other buffer migrations can be done until that debt
* is repaid.
*
* This approach allows moving a buffer of any size (it's important to allow
* that).
*
* The currency is simply time in microseconds and it increases as the clock
* ticks. The accumulated microseconds (us) are converted to bytes and
* returned.
*/
static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
u64 *max_bytes,
u64 *max_vis_bytes)
{
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
* throttling.
*
* It means that in order to get full max MBps, at least 5 IBs per
* second must be submitted and not more than 200ms apart from each
* other.
*/
const s64 us_upper_bound = 200000;
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
ce_preempt++;
else
de_preempt++;
if (!adev->mm_stats.log2_max_MBps) {
*max_bytes = 0;
*max_vis_bytes = 0;
return;
/* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
if (ce_preempt > 1 || de_preempt > 1)
return -EINVAL;
}
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
r = amdgpu_ctx_get_entity(parser->ctx, chunk_ib->ip_type,
chunk_ib->ip_instance, chunk_ib->ring,
&entity);
if (r)
return r;
spin_lock(&adev->mm_stats.lock);
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
parser->job->preamble_status |=
AMDGPU_PREAMBLE_IB_PRESENT;
/* Increase the amount of accumulated us. */
time_us = ktime_to_us(ktime_get());
increment_us = time_us - adev->mm_stats.last_update_us;
adev->mm_stats.last_update_us = time_us;
adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
us_upper_bound);
if (parser->entity && parser->entity != entity)
return -EINVAL;
/* This prevents the short period of low performance when the VRAM
* usage is low and the driver is in debt or doesn't have enough
* accumulated us to fill VRAM quickly.
*
* The situation can occur in these cases:
* - a lot of VRAM is freed by userspace
* - the presence of a big buffer causes a lot of evictions
* (solution: split buffers into smaller ones)
*
* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
* accum_us to a positive number.
*/
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
s64 min_us;
/* Return if there is no run queue associated with this entity.
* Possibly because of disabled HW IP*/
if (entity->rq == NULL)
return -EINVAL;
/* Be more aggressive on dGPUs. Try to fill a portion of free
* VRAM now.
*/
if (!(adev->flags & AMD_IS_APU))
min_us = bytes_to_us(adev, free_vram / 4);
else
min_us = 0; /* Reset accum_us on APUs. */
parser->entity = entity;
adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
ring = to_amdgpu_ring(entity->rq->sched);
r = amdgpu_ib_get(adev, vm, ring->funcs->parse_cs ?
chunk_ib->ib_bytes : 0,
AMDGPU_IB_POOL_DELAYED, ib);
if (r) {
DRM_ERROR("Failed to get ib !\n");
return r;
}
/* This is set to 0 if the driver is in debt to disallow (optional)
* buffer moves.
*/
*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
ib->gpu_addr = chunk_ib->va_start;
ib->length_dw = chunk_ib->ib_bytes / 4;
ib->flags = chunk_ib->flags;
/* Do the same for visible VRAM if half of it is free */
if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
j++;
}
if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
increment_us, us_upper_bound);
/* MM engine doesn't support user fences */
ring = to_amdgpu_ring(parser->entity->rq->sched);
if (parser->job->uf_addr && ring->funcs->no_user_fence)
return -EINVAL;
if (free_vis_vram >= total_vis_vram / 2)
adev->mm_stats.accum_us_vis =
max(bytes_to_us(adev, free_vis_vram / 2),
adev->mm_stats.accum_us_vis);
return 0;
}
static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
unsigned num_deps;
int i, r;
struct drm_amdgpu_cs_chunk_dep *deps;
deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_dep);
for (i = 0; i < num_deps; ++i) {
struct amdgpu_ctx *ctx;
struct drm_sched_entity *entity;
struct dma_fence *fence;
ctx = amdgpu_ctx_get(fpriv, deps[i].ctx_id);
if (ctx == NULL)
return -EINVAL;
r = amdgpu_ctx_get_entity(ctx, deps[i].ip_type,
deps[i].ip_instance,
deps[i].ring, &entity);
if (r) {
amdgpu_ctx_put(ctx);
return r;
}
*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
} else {
*max_vis_bytes = 0;
fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle);
amdgpu_ctx_put(ctx);
if (IS_ERR(fence))
return PTR_ERR(fence);
else if (!fence)
continue;
if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
struct drm_sched_fence *s_fence;
struct dma_fence *old = fence;
s_fence = to_drm_sched_fence(fence);
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(old);
}
spin_unlock(&adev->mm_stats.lock);
r = amdgpu_sync_fence(&p->job->sync, fence);
dma_fence_put(fence);
if (r)
return r;
}
return 0;
}
/* Report how many bytes have really been moved for the last command
* submission. This can result in a debt that can stop buffer migrations
* temporarily.
*/
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
u64 num_vis_bytes)
static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p,
uint32_t handle, u64 point,
u64 flags)
{
spin_lock(&adev->mm_stats.lock);
adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
spin_unlock(&adev->mm_stats.lock);
struct dma_fence *fence;
int r;
r = drm_syncobj_find_fence(p->filp, handle, point, flags, &fence);
if (r) {
DRM_ERROR("syncobj %u failed to find fence @ %llu (%d)!\n",
handle, point, r);
return r;
}
r = amdgpu_sync_fence(&p->job->sync, fence);
dma_fence_put(fence);
return r;
}
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct amdgpu_cs_parser *p = param;
struct ttm_operation_ctx ctx = {
.interruptible = true,
.no_wait_gpu = false,
.resv = bo->tbo.base.resv
};
uint32_t domain;
int r;
struct drm_amdgpu_cs_chunk_sem *deps;
unsigned num_deps;
int i, r;
deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_sem);
for (i = 0; i < num_deps; ++i) {
r = amdgpu_syncobj_lookup_and_add_to_sync(p, deps[i].handle,
0, 0);
if (r)
return r;
}
if (bo->tbo.pin_count)
return 0;
}
/* Don't move this buffer if we have depleted our allowance
* to move it. Don't move anything if the threshold is zero.
*/
if (p->bytes_moved < p->bytes_moved_threshold &&
(!bo->tbo.base.dma_buf ||
list_empty(&bo->tbo.base.dma_buf->attachments))) {
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
/* And don't move a CPU_ACCESS_REQUIRED BO to limited
* visible VRAM if we've depleted our allowance to do
* that.
*/
if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
domain = bo->preferred_domains;
else
domain = bo->allowed_domains;
} else {
domain = bo->preferred_domains;
}
} else {
domain = bo->allowed_domains;
static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
unsigned num_deps;
int i, r;
syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
for (i = 0; i < num_deps; ++i) {
r = amdgpu_syncobj_lookup_and_add_to_sync(p,
syncobj_deps[i].handle,
syncobj_deps[i].point,
syncobj_deps[i].flags);
if (r)
return r;
}
retry:
amdgpu_bo_placement_from_domain(bo, domain);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
return 0;
}
static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_sem *deps;
unsigned num_deps;
int i;
deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_sem);
if (p->post_deps)
return -EINVAL;
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
GFP_KERNEL);
p->num_post_deps = 0;
if (!p->post_deps)
return -ENOMEM;
p->bytes_moved += ctx.bytes_moved;
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
amdgpu_bo_in_cpu_visible_vram(bo))
p->bytes_moved_vis += ctx.bytes_moved;
if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
domain = bo->allowed_domains;
goto retry;
for (i = 0; i < num_deps; ++i) {
p->post_deps[i].syncobj =
drm_syncobj_find(p->filp, deps[i].handle);
if (!p->post_deps[i].syncobj)
return -EINVAL;
p->post_deps[i].chain = NULL;
p->post_deps[i].point = 0;
p->num_post_deps++;
}
return r;
return 0;
}
static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
struct list_head *validated)
static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct ttm_operation_ctx ctx = { true, false };
struct amdgpu_bo_list_entry *lobj;
int r;
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
unsigned num_deps;
int i;
list_for_each_entry(lobj, validated, tv.head) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(lobj->tv.bo);
struct mm_struct *usermm;
syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
usermm = amdgpu_ttm_tt_get_usermm(bo->tbo.ttm);
if (usermm && usermm != current->mm)
return -EPERM;
if (p->post_deps)
return -EINVAL;
if (amdgpu_ttm_tt_is_userptr(bo->tbo.ttm) &&
lobj->user_invalidated && lobj->user_pages) {
amdgpu_bo_placement_from_domain(bo,
AMDGPU_GEM_DOMAIN_CPU);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (r)
return r;
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
GFP_KERNEL);
p->num_post_deps = 0;
amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
lobj->user_pages);
}
if (!p->post_deps)
return -ENOMEM;
r = amdgpu_cs_bo_validate(p, bo);
if (r)
return r;
for (i = 0; i < num_deps; ++i) {
struct amdgpu_cs_post_dep *dep = &p->post_deps[i];
kvfree(lobj->user_pages);
lobj->user_pages = NULL;
dep->chain = NULL;
if (syncobj_deps[i].point) {
dep->chain = dma_fence_chain_alloc();
if (!dep->chain)
return -ENOMEM;
}
dep->syncobj = drm_syncobj_find(p->filp,
syncobj_deps[i].handle);
if (!dep->syncobj) {
dma_fence_chain_free(dep->chain);
return -EINVAL;
}
dep->point = syncobj_deps[i].point;
p->num_post_deps++;
}
return 0;
}
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
struct amdgpu_cs_parser *p)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct list_head duplicates;
int r;
int i, r;
INIT_LIST_HEAD(&p->validated);
for (i = 0; i < p->nchunks; ++i) {
struct amdgpu_cs_chunk *chunk;
/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
if (cs->in.bo_list_handle) {
if (p->bo_list)
return -EINVAL;
chunk = &p->chunks[i];
r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
&p->bo_list);
switch (chunk->chunk_id) {
case AMDGPU_CHUNK_ID_DEPENDENCIES:
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
r = amdgpu_cs_process_fence_dep(p, chunk);
if (r)
return r;
} else if (!p->bo_list) {
/* Create a empty bo_list when no handle is provided */
r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
&p->bo_list);
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
r = amdgpu_cs_process_syncobj_in_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
r = amdgpu_cs_process_syncobj_out_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
r = amdgpu_cs_process_syncobj_timeline_in_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
r = amdgpu_cs_process_syncobj_timeline_out_dep(p, chunk);
if (r)
return r;
break;
}
mutex_lock(&p->bo_list->bo_list_mutex);
/* One for TTM and one for the CS job */
amdgpu_bo_list_for_each_entry(e, p->bo_list)
e->tv.num_shared = 2;
amdgpu_bo_list_get_list(p->bo_list, &p->validated);
INIT_LIST_HEAD(&duplicates);
amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
if (p->uf_entry.tv.bo && !ttm_to_amdgpu_bo(p->uf_entry.tv.bo)->parent)
list_add(&p->uf_entry.tv.head, &p->validated);
/* Get userptr backing pages. If pages are updated after registered
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
*/
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
bool userpage_invalidated = false;
int i;
e->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
sizeof(struct page *),
GFP_KERNEL | __GFP_ZERO);
if (!e->user_pages) {
DRM_ERROR("kvmalloc_array failure\n");
r = -ENOMEM;
goto out_free_user_pages;
}
r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
if (r) {
kvfree(e->user_pages);
e->user_pages = NULL;
goto out_free_user_pages;
}
return 0;
}
for (i = 0; i < bo->tbo.ttm->num_pages; i++) {
if (bo->tbo.ttm->pages[i] != e->user_pages[i]) {
userpage_invalidated = true;
break;
}
}
e->user_invalidated = userpage_invalidated;
}
/* Convert microseconds to bytes. */
static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
{
if (us <= 0 || !adev->mm_stats.log2_max_MBps)
return 0;
r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
&duplicates);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
goto out_free_user_pages;
}
/* Since accum_us is incremented by a million per second, just
* multiply it by the number of MB/s to get the number of bytes.
*/
return us << adev->mm_stats.log2_max_MBps;
}
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
{
if (!adev->mm_stats.log2_max_MBps)
return 0;
e->bo_va = amdgpu_vm_bo_find(vm, bo);
}
return bytes >> adev->mm_stats.log2_max_MBps;
}
/* Move fence waiting after getting reservation lock of
* PD root. Then there is no need on a ctx mutex lock.
/* Returns how many bytes TTM can move right now. If no bytes can be moved,
* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
* which means it can go over the threshold once. If that happens, the driver
* will be in debt and no other buffer migrations can be done until that debt
* is repaid.
*
* This approach allows moving a buffer of any size (it's important to allow
* that).
*
* The currency is simply time in microseconds and it increases as the clock
* ticks. The accumulated microseconds (us) are converted to bytes and
* returned.
*/
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
goto error_validate;
}
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
&p->bytes_moved_vis_threshold);
p->bytes_moved = 0;
p->bytes_moved_vis = 0;
static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
u64 *max_bytes,
u64 *max_vis_bytes)
{
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
* throttling.
*
* It means that in order to get full max MBps, at least 5 IBs per
* second must be submitted and not more than 200ms apart from each
* other.
*/
const s64 us_upper_bound = 200000;
r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm,
amdgpu_cs_bo_validate, p);
if (r) {
DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n");
goto error_validate;
if (!adev->mm_stats.log2_max_MBps) {
*max_bytes = 0;
*max_vis_bytes = 0;
return;
}
r = amdgpu_cs_list_validate(p, &duplicates);
if (r)
goto error_validate;
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
r = amdgpu_cs_list_validate(p, &p->validated);
if (r)
goto error_validate;
spin_lock(&adev->mm_stats.lock);
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
p->bytes_moved_vis);
/* Increase the amount of accumulated us. */
time_us = ktime_to_us(ktime_get());
increment_us = time_us - adev->mm_stats.last_update_us;
adev->mm_stats.last_update_us = time_us;
adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
us_upper_bound);
amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
p->bo_list->gws_obj, p->bo_list->oa_obj);
/* This prevents the short period of low performance when the VRAM
* usage is low and the driver is in debt or doesn't have enough
* accumulated us to fill VRAM quickly.
*
* The situation can occur in these cases:
* - a lot of VRAM is freed by userspace
* - the presence of a big buffer causes a lot of evictions
* (solution: split buffers into smaller ones)
*
* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
* accum_us to a positive number.
*/
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
s64 min_us;
if (!r && p->uf_entry.tv.bo) {
struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo);
/* Be more aggressive on dGPUs. Try to fill a portion of free
* VRAM now.
*/
if (!(adev->flags & AMD_IS_APU))
min_us = bytes_to_us(adev, free_vram / 4);
else
min_us = 0; /* Reset accum_us on APUs. */
r = amdgpu_ttm_alloc_gart(&uf->tbo);
p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
}
error_validate:
if (r)
ttm_eu_backoff_reservation(&p->ticket, &p->validated);
/* This is set to 0 if the driver is in debt to disallow (optional)
* buffer moves.
*/
*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
out_free_user_pages:
if (r) {
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
/* Do the same for visible VRAM if half of it is free */
if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
if (!e->user_pages)
continue;
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
kvfree(e->user_pages);
e->user_pages = NULL;
if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
increment_us, us_upper_bound);
if (free_vis_vram >= total_vis_vram / 2)
adev->mm_stats.accum_us_vis =
max(bytes_to_us(adev, free_vis_vram / 2),
adev->mm_stats.accum_us_vis);
}
mutex_unlock(&p->bo_list->bo_list_mutex);
*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
} else {
*max_vis_bytes = 0;
}
return r;
spin_unlock(&adev->mm_stats.lock);
}
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
/* Report how many bytes have really been moved for the last command
* submission. This can result in a debt that can stop buffer migrations
* temporarily.
*/
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
u64 num_vis_bytes)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_bo_list_entry *e;
int r;
list_for_each_entry(e, &p->validated, tv.head) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
struct dma_resv *resv = bo->tbo.base.resv;
enum amdgpu_sync_mode sync_mode;
sync_mode = amdgpu_bo_explicit_sync(bo) ?
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
&fpriv->vm);
if (r)
return r;
}
return 0;
spin_lock(&adev->mm_stats.lock);
adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
spin_unlock(&adev->mm_stats.lock);
}
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
{
struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_device *adev = p->adev;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct amdgpu_bo_va *bo_va;
struct amdgpu_bo *bo;
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct amdgpu_cs_parser *p = param;
struct ttm_operation_ctx ctx = {
.interruptible = true,
.no_wait_gpu = false,
.resv = bo->tbo.base.resv
};
uint32_t domain;
int r;
/* Only for UVD/VCE VM emulation */
if (ring->funcs->parse_cs || ring->funcs->patch_cs_in_place) {
unsigned i, j;
for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) {
struct drm_amdgpu_cs_chunk_ib *chunk_ib;
struct amdgpu_bo_va_mapping *m;
struct amdgpu_bo *aobj = NULL;
struct amdgpu_cs_chunk *chunk;
uint64_t offset, va_start;
struct amdgpu_ib *ib;
uint8_t *kptr;
chunk = &p->chunks[i];
ib = &p->job->ibs[j];
chunk_ib = chunk->kdata;
if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
continue;
va_start = chunk_ib->va_start & AMDGPU_GMC_HOLE_MASK;
r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
if (r) {
DRM_ERROR("IB va_start is invalid\n");
return r;
}
if (bo->tbo.pin_count)
return 0;
if ((va_start + chunk_ib->ib_bytes) >
(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
DRM_ERROR("IB va_start+ib_bytes is invalid\n");
return -EINVAL;
/* Don't move this buffer if we have depleted our allowance
* to move it. Don't move anything if the threshold is zero.
*/
if (p->bytes_moved < p->bytes_moved_threshold &&
(!bo->tbo.base.dma_buf ||
list_empty(&bo->tbo.base.dma_buf->attachments))) {
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
/* And don't move a CPU_ACCESS_REQUIRED BO to limited
* visible VRAM if we've depleted our allowance to do
* that.
*/
if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
domain = bo->preferred_domains;
else
domain = bo->allowed_domains;
} else {
domain = bo->preferred_domains;
}
/* the IB should be reserved at this point */
r = amdgpu_bo_kmap(aobj, (void **)&kptr);
if (r) {
return r;
} else {
domain = bo->allowed_domains;
}
offset = m->start * AMDGPU_GPU_PAGE_SIZE;
kptr += va_start - offset;
retry:
amdgpu_bo_placement_from_domain(bo, domain);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (ring->funcs->parse_cs) {
memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
amdgpu_bo_kunmap(aobj);
p->bytes_moved += ctx.bytes_moved;
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
amdgpu_bo_in_cpu_visible_vram(bo))
p->bytes_moved_vis += ctx.bytes_moved;
r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
if (r)
return r;
} else {
ib->ptr = (uint32_t *)kptr;
r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
amdgpu_bo_kunmap(aobj);
if (r)
return r;
if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
domain = bo->allowed_domains;
goto retry;
}
j++;
}
}
return r;
}
if (!p->job->vm)
return amdgpu_cs_sync_rings(p);
static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
struct list_head *validated)
{
struct ttm_operation_ctx ctx = { true, false };
struct amdgpu_bo_list_entry *lobj;
int r;
list_for_each_entry(lobj, validated, tv.head) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(lobj->tv.bo);
struct mm_struct *usermm;
r = amdgpu_vm_clear_freed(adev, vm, NULL);
if (r)
return r;
usermm = amdgpu_ttm_tt_get_usermm(bo->tbo.ttm);
if (usermm && usermm != current->mm)
return -EPERM;
r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
if (amdgpu_ttm_tt_is_userptr(bo->tbo.ttm) &&
lobj->user_invalidated && lobj->user_pages) {
amdgpu_bo_placement_from_domain(bo,
AMDGPU_GEM_DOMAIN_CPU);
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
if (r)
return r;
amdgpu_ttm_tt_set_user_pages(bo->tbo.ttm,
lobj->user_pages);
}
if (fpriv->csa_va) {
bo_va = fpriv->csa_va;
BUG_ON(!bo_va);
r = amdgpu_vm_bo_update(adev, bo_va, false);
r = amdgpu_cs_bo_validate(p, bo);
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
if (r)
return r;
kvfree(lobj->user_pages);
lobj->user_pages = NULL;
}
return 0;
}
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
/* ignore duplicates */
bo = ttm_to_amdgpu_bo(e->tv.bo);
if (!bo)
continue;
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct list_head duplicates;
int r;
bo_va = e->bo_va;
if (bo_va == NULL)
continue;
INIT_LIST_HEAD(&p->validated);
r = amdgpu_vm_bo_update(adev, bo_va, false);
/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
if (cs->in.bo_list_handle) {
if (p->bo_list)
return -EINVAL;
r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
&p->bo_list);
if (r)
return r;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
} else if (!p->bo_list) {
/* Create a empty bo_list when no handle is provided */
r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
&p->bo_list);
if (r)
return r;
}
r = amdgpu_vm_handle_moved(adev, vm);
if (r)
return r;
mutex_lock(&p->bo_list->bo_list_mutex);
r = amdgpu_vm_update_pdes(adev, vm, false);
if (r)
return r;
/* One for TTM and one for the CS job */
amdgpu_bo_list_for_each_entry(e, p->bo_list)
e->tv.num_shared = 2;
r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
if (r)
return r;
amdgpu_bo_list_get_list(p->bo_list, &p->validated);
p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
INIT_LIST_HEAD(&duplicates);
amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
if (amdgpu_vm_debug) {
/* Invalidate all BOs to test for userspace bugs */
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
if (p->uf_entry.tv.bo && !ttm_to_amdgpu_bo(p->uf_entry.tv.bo)->parent)
list_add(&p->uf_entry.tv.head, &p->validated);
/* ignore duplicates */
if (!bo)
continue;
/* Get userptr backing pages. If pages are updated after registered
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
*/
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
bool userpage_invalidated = false;
int i;
amdgpu_vm_bo_invalidate(adev, bo, false);
e->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
sizeof(struct page *),
GFP_KERNEL | __GFP_ZERO);
if (!e->user_pages) {
DRM_ERROR("kvmalloc_array failure\n");
r = -ENOMEM;
goto out_free_user_pages;
}
r = amdgpu_ttm_tt_get_user_pages(bo, e->user_pages);
if (r) {
kvfree(e->user_pages);
e->user_pages = NULL;
goto out_free_user_pages;
}
return amdgpu_cs_sync_rings(p);
}
for (i = 0; i < bo->tbo.ttm->num_pages; i++) {
if (bo->tbo.ttm->pages[i] != e->user_pages[i]) {
userpage_invalidated = true;
break;
}
}
e->user_invalidated = userpage_invalidated;
}
static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
struct amdgpu_cs_parser *parser)
{
struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
struct amdgpu_vm *vm = &fpriv->vm;
int r, ce_preempt = 0, de_preempt = 0;
struct amdgpu_ring *ring;
int i, j;
r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
&duplicates);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
goto out_free_user_pages;
}
for (i = 0, j = 0; i < parser->nchunks && j < parser->job->num_ibs; i++) {
struct amdgpu_cs_chunk *chunk;
struct amdgpu_ib *ib;
struct drm_amdgpu_cs_chunk_ib *chunk_ib;
struct drm_sched_entity *entity;
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
chunk = &parser->chunks[i];
ib = &parser->job->ibs[j];
chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
e->bo_va = amdgpu_vm_bo_find(vm, bo);
}
if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
continue;
/* Move fence waiting after getting reservation lock of
* PD root. Then there is no need on a ctx mutex lock.
*/
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
goto error_validate;
}
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
ce_preempt++;
else
de_preempt++;
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
&p->bytes_moved_vis_threshold);
p->bytes_moved = 0;
p->bytes_moved_vis = 0;
/* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
if (ce_preempt > 1 || de_preempt > 1)
return -EINVAL;
r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm,
amdgpu_cs_bo_validate, p);
if (r) {
DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n");
goto error_validate;
}
r = amdgpu_ctx_get_entity(parser->ctx, chunk_ib->ip_type,
chunk_ib->ip_instance, chunk_ib->ring,
&entity);
r = amdgpu_cs_list_validate(p, &duplicates);
if (r)
return r;
goto error_validate;
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
parser->job->preamble_status |=
AMDGPU_PREAMBLE_IB_PRESENT;
r = amdgpu_cs_list_validate(p, &p->validated);
if (r)
goto error_validate;
if (parser->entity && parser->entity != entity)
return -EINVAL;
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
p->bytes_moved_vis);
/* Return if there is no run queue associated with this entity.
* Possibly because of disabled HW IP*/
if (entity->rq == NULL)
return -EINVAL;
amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
p->bo_list->gws_obj, p->bo_list->oa_obj);
parser->entity = entity;
if (!r && p->uf_entry.tv.bo) {
struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo);
ring = to_amdgpu_ring(entity->rq->sched);
r = amdgpu_ib_get(adev, vm, ring->funcs->parse_cs ?
chunk_ib->ib_bytes : 0,
AMDGPU_IB_POOL_DELAYED, ib);
if (r) {
DRM_ERROR("Failed to get ib !\n");
return r;
r = amdgpu_ttm_alloc_gart(&uf->tbo);
p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
}
ib->gpu_addr = chunk_ib->va_start;
ib->length_dw = chunk_ib->ib_bytes / 4;
ib->flags = chunk_ib->flags;
j++;
}
error_validate:
if (r)
ttm_eu_backoff_reservation(&p->ticket, &p->validated);
/* MM engine doesn't support user fences */
ring = to_amdgpu_ring(parser->entity->rq->sched);
if (parser->job->uf_addr && ring->funcs->no_user_fence)
return -EINVAL;
out_free_user_pages:
if (r) {
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
return 0;
if (!e->user_pages)
continue;
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
kvfree(e->user_pages);
e->user_pages = NULL;
}
mutex_unlock(&p->bo_list->bo_list_mutex);
}
return r;
}
static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
unsigned num_deps;
int i, r;
struct drm_amdgpu_cs_chunk_dep *deps;
deps = (struct drm_amdgpu_cs_chunk_dep *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_dep);
int i;
for (i = 0; i < num_deps; ++i) {
struct amdgpu_ctx *ctx;
struct drm_sched_entity *entity;
struct dma_fence *fence;
if (!trace_amdgpu_cs_enabled())
return;
ctx = amdgpu_ctx_get(fpriv, deps[i].ctx_id);
if (ctx == NULL)
return -EINVAL;
for (i = 0; i < parser->job->num_ibs; i++)
trace_amdgpu_cs(parser, i);
}
r = amdgpu_ctx_get_entity(ctx, deps[i].ip_type,
deps[i].ip_instance,
deps[i].ring, &entity);
if (r) {
amdgpu_ctx_put(ctx);
return r;
}
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
{
struct amdgpu_ring *ring = to_amdgpu_ring(p->entity->rq->sched);
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_device *adev = p->adev;
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct amdgpu_bo_va *bo_va;
struct amdgpu_bo *bo;
int r;
fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle);
amdgpu_ctx_put(ctx);
/* Only for UVD/VCE VM emulation */
if (ring->funcs->parse_cs || ring->funcs->patch_cs_in_place) {
unsigned i, j;
if (IS_ERR(fence))
return PTR_ERR(fence);
else if (!fence)
continue;
for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) {
struct drm_amdgpu_cs_chunk_ib *chunk_ib;
struct amdgpu_bo_va_mapping *m;
struct amdgpu_bo *aobj = NULL;
struct amdgpu_cs_chunk *chunk;
uint64_t offset, va_start;
struct amdgpu_ib *ib;
uint8_t *kptr;
if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
struct drm_sched_fence *s_fence;
struct dma_fence *old = fence;
chunk = &p->chunks[i];
ib = &p->job->ibs[j];
chunk_ib = chunk->kdata;
s_fence = to_drm_sched_fence(fence);
fence = dma_fence_get(&s_fence->scheduled);
dma_fence_put(old);
}
if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
continue;
r = amdgpu_sync_fence(&p->job->sync, fence);
dma_fence_put(fence);
if (r)
va_start = chunk_ib->va_start & AMDGPU_GMC_HOLE_MASK;
r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
if (r) {
DRM_ERROR("IB va_start is invalid\n");
return r;
}
return 0;
}
static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p,
uint32_t handle, u64 point,
u64 flags)
{
struct dma_fence *fence;
int r;
if ((va_start + chunk_ib->ib_bytes) >
(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
DRM_ERROR("IB va_start+ib_bytes is invalid\n");
return -EINVAL;
}
r = drm_syncobj_find_fence(p->filp, handle, point, flags, &fence);
/* the IB should be reserved at this point */
r = amdgpu_bo_kmap(aobj, (void **)&kptr);
if (r) {
DRM_ERROR("syncobj %u failed to find fence @ %llu (%d)!\n",
handle, point, r);
return r;
}
r = amdgpu_sync_fence(&p->job->sync, fence);
dma_fence_put(fence);
return r;
}
offset = m->start * AMDGPU_GPU_PAGE_SIZE;
kptr += va_start - offset;
static int amdgpu_cs_process_syncobj_in_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_sem *deps;
unsigned num_deps;
int i, r;
if (ring->funcs->parse_cs) {
memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
amdgpu_bo_kunmap(aobj);
deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_sem);
for (i = 0; i < num_deps; ++i) {
r = amdgpu_syncobj_lookup_and_add_to_sync(p, deps[i].handle,
0, 0);
r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
if (r)
return r;
} else {
ib->ptr = (uint32_t *)kptr;
r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
amdgpu_bo_kunmap(aobj);
if (r)
return r;
}
return 0;
}
j++;
}
}
static int amdgpu_cs_process_syncobj_timeline_in_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
unsigned num_deps;
int i, r;
if (!p->job->vm)
return 0;
syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
for (i = 0; i < num_deps; ++i) {
r = amdgpu_syncobj_lookup_and_add_to_sync(p,
syncobj_deps[i].handle,
syncobj_deps[i].point,
syncobj_deps[i].flags);
r = amdgpu_vm_clear_freed(adev, vm, NULL);
if (r)
return r;
}
return 0;
}
r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
if (r)
return r;
static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_sem *deps;
unsigned num_deps;
int i;
r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
if (r)
return r;
deps = (struct drm_amdgpu_cs_chunk_sem *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_sem);
if (fpriv->csa_va) {
bo_va = fpriv->csa_va;
BUG_ON(!bo_va);
r = amdgpu_vm_bo_update(adev, bo_va, false);
if (r)
return r;
if (p->post_deps)
return -EINVAL;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
if (r)
return r;
}
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
GFP_KERNEL);
p->num_post_deps = 0;
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
/* ignore duplicates */
bo = ttm_to_amdgpu_bo(e->tv.bo);
if (!bo)
continue;
if (!p->post_deps)
return -ENOMEM;
bo_va = e->bo_va;
if (bo_va == NULL)
continue;
r = amdgpu_vm_bo_update(adev, bo_va, false);
if (r)
return r;
for (i = 0; i < num_deps; ++i) {
p->post_deps[i].syncobj =
drm_syncobj_find(p->filp, deps[i].handle);
if (!p->post_deps[i].syncobj)
return -EINVAL;
p->post_deps[i].chain = NULL;
p->post_deps[i].point = 0;
p->num_post_deps++;
r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
if (r)
return r;
}
return 0;
}
static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p,
struct amdgpu_cs_chunk *chunk)
{
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
unsigned num_deps;
int i;
syncobj_deps = (struct drm_amdgpu_cs_chunk_syncobj *)chunk->kdata;
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
r = amdgpu_vm_handle_moved(adev, vm);
if (r)
return r;
if (p->post_deps)
return -EINVAL;
r = amdgpu_vm_update_pdes(adev, vm, false);
if (r)
return r;
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
GFP_KERNEL);
p->num_post_deps = 0;
r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
if (r)
return r;
if (!p->post_deps)
return -ENOMEM;
p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
for (i = 0; i < num_deps; ++i) {
struct amdgpu_cs_post_dep *dep = &p->post_deps[i];
if (amdgpu_vm_debug) {
/* Invalidate all BOs to test for userspace bugs */
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
dep->chain = NULL;
if (syncobj_deps[i].point) {
dep->chain = dma_fence_chain_alloc();
if (!dep->chain)
return -ENOMEM;
}
/* ignore duplicates */
if (!bo)
continue;
dep->syncobj = drm_syncobj_find(p->filp,
syncobj_deps[i].handle);
if (!dep->syncobj) {
dma_fence_chain_free(dep->chain);
return -EINVAL;
amdgpu_vm_bo_invalidate(adev, bo, false);
}
dep->point = syncobj_deps[i].point;
p->num_post_deps++;
}
return 0;
}
static int amdgpu_cs_dependencies(struct amdgpu_device *adev,
struct amdgpu_cs_parser *p)
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
{
int i, r;
for (i = 0; i < p->nchunks; ++i) {
struct amdgpu_cs_chunk *chunk;
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_bo_list_entry *e;
int r;
chunk = &p->chunks[i];
list_for_each_entry(e, &p->validated, tv.head) {
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
struct dma_resv *resv = bo->tbo.base.resv;
enum amdgpu_sync_mode sync_mode;
switch (chunk->chunk_id) {
case AMDGPU_CHUNK_ID_DEPENDENCIES:
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
r = amdgpu_cs_process_fence_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
r = amdgpu_cs_process_syncobj_in_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
r = amdgpu_cs_process_syncobj_out_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
r = amdgpu_cs_process_syncobj_timeline_in_dep(p, chunk);
if (r)
return r;
break;
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
r = amdgpu_cs_process_syncobj_timeline_out_dep(p, chunk);
sync_mode = amdgpu_bo_explicit_sync(bo) ?
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
&fpriv->vm);
if (r)
return r;
break;
}
}
return 0;
}
......@@ -1243,17 +1252,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
return r;
}
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
{
int i;
if (!trace_amdgpu_cs_enabled())
return;
for (i = 0; i < parser->job->num_ibs; i++)
trace_amdgpu_cs(parser, i);
}
/* Cleanup the parser structure */
static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
bool backoff)
......@@ -1342,6 +1340,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
if (r)
goto out;
r = amdgpu_cs_sync_rings(&parser);
if (r)
goto out;
r = amdgpu_cs_submit(&parser, data);
out:
amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment