Commit 4624459c authored by Christian König, committed by Alex Deucher

drm/amdgpu: add gang submit frontend v6

Allows submitting jobs as a gang that needs to run on multiple engines at the
same time.

All members of the gang get the same implicit, explicit and VM dependencies,
so no gang member will start running until everything else is ready.

The last job is considered the gang leader (usually a submission to the GFX
ring) and is used for signaling output dependencies.

Each job is remembered individually as a user of a buffer object, so there is
no joining of work at the end.
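
To make the two rules above concrete, here is a minimal user-space model
(illustrative stand-in types only, not amdgpu code): every member waits on
the union of the gang's dependencies, and only the leader's fence is visible
to output dependencies.

/*
 * Minimal user-space model of the dependency rules described above; all
 * names and types here are hypothetical stand-ins, not amdgpu code.
 */
#include <stdbool.h>
#include <stdio.h>

struct fence { bool signaled; };

struct gang {
	struct fence *deps[8];		/* shared implicit/explicit/VM deps */
	unsigned int num_deps;
	struct fence member_done[4];	/* one fence per gang member */
	unsigned int size;		/* last member is the gang leader */
};

/* No member may start before every shared dependency has signaled. */
static bool gang_can_start(const struct gang *g)
{
	for (unsigned int i = 0; i < g->num_deps; i++)
		if (!g->deps[i]->signaled)
			return false;
	return true;
}

/* Output dependencies are signaled by the leader's fence only. */
static struct fence *gang_out_fence(struct gang *g)
{
	return &g->member_done[g->size - 1];
}

int main(void)
{
	struct fence vm_ready = { .signaled = false };
	struct gang g = { .deps = { &vm_ready }, .num_deps = 1, .size = 2 };

	printf("can start: %d\n", gang_can_start(&g));	/* 0: dep pending */
	vm_ready.signaled = true;
	printf("can start: %d\n", gang_can_start(&g));	/* 1: all deps done */
	gang_out_fence(&g)->signaled = true;		/* leader signals out */
	return 0;
}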

v2: rebase and fix review comments from Andrey and Yogesh
v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang
    leader only when necessary
v4: fix order of pushing jobs and adding fences found by Trigger.
v5: fix job index calculation and adding IBs to jobs
v6: fix typo found by Alex
Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 68ce8b24
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
 	ib->length_dw = ib_len;
 	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
 	job->vmid = vmid;
+	job->num_ibs = 1;
 
 	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
...
This diff is collapsed.
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -27,6 +27,8 @@
 #include "amdgpu_bo_list.h"
 #include "amdgpu_ring.h"
 
+#define AMDGPU_CS_GANG_SIZE	4
+
 struct amdgpu_bo_va_mapping;
 
 struct amdgpu_cs_chunk {
@@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
 	unsigned nchunks;
 	struct amdgpu_cs_chunk *chunks;
 
-	/* scheduler job object */
-	struct amdgpu_job *job;
-	struct drm_sched_entity *entity;
+	/* scheduler job objects */
+	unsigned int gang_size;
+	struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job *jobs[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job *gang_leader;
 
 	/* buffer objects */
 	struct ww_acquire_ctx ticket;
...
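
The frontend code that fills these fields lives in the collapsed diff above.
As a rough, assumed sketch of the convention implied by the commit message
(the last pushed job becomes the leader), using hypothetical stand-in types
rather than the kernel structs:

/*
 * Hypothetical illustration of the new parser fields; stand-in types,
 * not the kernel structs. The last collected job becomes the leader.
 */
#define AMDGPU_CS_GANG_SIZE 4

struct sketch_job { unsigned int ring; };

struct sketch_parser {
	unsigned int gang_size;
	struct sketch_job *jobs[AMDGPU_CS_GANG_SIZE];
	struct sketch_job *gang_leader;
};

/* Once all gang members are collected, the last one leads the gang. */
static void sketch_pick_gang_leader(struct sketch_parser *p)
{
	if (p->gang_size)
		p->gang_leader = p->jobs[p->gang_size - 1];
}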
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 	 */
 	(*job)->base.sched = &adev->rings[0]->sched;
 	(*job)->vm = vm;
-	(*job)->num_ibs = num_ibs;
 
 	amdgpu_sync_create(&(*job)->sync);
 	amdgpu_sync_create(&(*job)->sched_sync);
@@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 	if (r)
 		return r;
 
+	(*job)->num_ibs = 1;
 	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
 	if (r)
 		kfree(*job);
...
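
Together with the amdgpu_amdkfd.c hunk above, this moves responsibility for
num_ibs from amdgpu_job_alloc() to the code that actually attaches IBs,
presumably because with gangs the per-job IB count is only known while the
chunks are parsed. A stand-in illustration of the changed contract (not the
kernel API):

/*
 * Stand-in sketch: allocation no longer sets the IB count; whoever
 * fills in the IBs sets it explicitly, as both hunks above now do.
 */
struct sketch_alloc_job { unsigned int num_ibs; };

static void sketch_attach_single_ib(struct sketch_alloc_job *job)
{
	job->num_ibs = 1;	/* set where the IB is actually attached */
}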
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
 );
 
 TRACE_EVENT(amdgpu_cs,
-	    TP_PROTO(struct amdgpu_cs_parser *p, int i),
-	    TP_ARGS(p, i),
+	    TP_PROTO(struct amdgpu_cs_parser *p,
+		     struct amdgpu_job *job,
+		     struct amdgpu_ib *ib),
+	    TP_ARGS(p, job, ib),
 	    TP_STRUCT__entry(
 			     __field(struct amdgpu_bo_list *, bo_list)
 			     __field(u32, ring)
@@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
 	    TP_fast_assign(
 			   __entry->bo_list = p->bo_list;
-			   __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
-			   __entry->dw = p->job->ibs[i].length_dw;
+			   __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
+			   __entry->dw = ib->length_dw;
 			   __entry->fences = amdgpu_fence_count_emitted(
-					to_amdgpu_ring(p->entity->rq->sched));
+					to_amdgpu_ring(job->base.sched));
 	    ),
 	    TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
 		      __entry->bo_list, __entry->ring, __entry->dw,
...
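
With a gang there is no longer a single p->job or p->entity to index into,
so the tracepoint now receives the job and IB explicitly. The expected call
pattern (a sketch; the actual call site is in the collapsed frontend diff)
would iterate each gang member's IBs:

/* Sketch of the per-job, per-IB tracing loop implied by the new
 * TP_PROTO; the real caller is in the collapsed frontend diff. */
unsigned int i, j;

for (i = 0; i < p->gang_size; i++) {
	struct amdgpu_job *job = p->jobs[i];

	for (j = 0; j < job->num_ibs; j++)
		trace_amdgpu_cs(p, job, &job->ibs[j]);
}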