Commit be254550 authored by Jiadong.Zhu's avatar Jiadong.Zhu Committed by Alex Deucher

drm/amdgpu: Modify unmap_queue format for gfx9 (v6)

1. Modify the unmap_queue package on gfx9. Add trailing fence to track the
   preemption done.
2. Modify emit_ce_meta emit_de_meta functions for the resumed ibs.

v2: Restyle code not to use ternary operator.
v3: Modify code format.
v4: Enable Mid-Command Buffer Preemption for gfx9 by default.
v5: Optimize the flag bit set for emit_fence.
v6: Modify log message for preemption timeout.

Cc: Christian Koenig <Christian.Koenig@amd.com>
Cc: Michel Dänzer <michel@daenzer.net>
Cc: Luben Tuikov <Luben.Tuikov@amd.com>
Signed-off-by: default avatarJiadong.Zhu <Jiadong.Zhu@amd.com>
Acked-by: default avatarChristian König <christian.koenig@amd.com>
Acked-by: default avatarHuang Rui <ray.huang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 0c97a19a
...@@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level { ...@@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level {
#define AMDGPU_FENCE_FLAG_64BIT (1 << 0) #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
#define AMDGPU_FENCE_FLAG_INT (1 << 1) #define AMDGPU_FENCE_FLAG_INT (1 << 1)
#define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2) #define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
#define AMDGPU_FENCE_FLAG_EXEC (1 << 3)
#define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched) #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)
......
...@@ -755,7 +755,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev); ...@@ -755,7 +755,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev, static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
struct amdgpu_cu_info *cu_info); struct amdgpu_cu_info *cu_info);
static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev); static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring); static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring); static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev, static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status); void *ras_error_status);
...@@ -828,9 +828,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring, ...@@ -828,9 +828,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index)); PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
if (action == PREEMPT_QUEUES_NO_UNMAP) { if (action == PREEMPT_QUEUES_NO_UNMAP) {
amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr)); amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr)); amdgpu_ring_write(kiq_ring, 0);
amdgpu_ring_write(kiq_ring, seq); amdgpu_ring_write(kiq_ring, 0);
} else { } else {
amdgpu_ring_write(kiq_ring, 0); amdgpu_ring_write(kiq_ring, 0);
amdgpu_ring_write(kiq_ring, 0); amdgpu_ring_write(kiq_ring, 0);
...@@ -5204,11 +5205,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring, ...@@ -5204,11 +5205,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
control |= ib->length_dw | (vmid << 24); control |= ib->length_dw | (vmid << 24);
if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) { if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
control |= INDIRECT_BUFFER_PRE_ENB(1); control |= INDIRECT_BUFFER_PRE_ENB(1);
if (flags & AMDGPU_IB_PREEMPTED)
control |= INDIRECT_BUFFER_PRE_RESUME(1);
if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid) if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
gfx_v9_0_ring_emit_de_meta(ring); gfx_v9_0_ring_emit_de_meta(ring,
(!amdgpu_sriov_vf(ring->adev) &&
flags & AMDGPU_IB_PREEMPTED) ?
true : false);
} }
amdgpu_ring_write(ring, header); amdgpu_ring_write(ring, header);
...@@ -5263,17 +5270,24 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr, ...@@ -5263,17 +5270,24 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT; bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
bool int_sel = flags & AMDGPU_FENCE_FLAG_INT; bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY; bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
uint32_t dw2 = 0;
/* RELEASE_MEM - flush caches, send int */ /* RELEASE_MEM - flush caches, send int */
amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6)); amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
EOP_TC_NC_ACTION_EN) : if (writeback) {
(EOP_TCL1_ACTION_EN | dw2 = EOP_TC_NC_ACTION_EN;
EOP_TC_ACTION_EN | } else {
EOP_TC_WB_ACTION_EN | dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
EOP_TC_MD_ACTION_EN)) | EOP_TC_MD_ACTION_EN;
EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | }
EVENT_INDEX(5))); dw2 |= EOP_TC_WB_ACTION_EN | EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
EVENT_INDEX(5);
if (exec)
dw2 |= EOP_EXEC;
amdgpu_ring_write(ring, dw2);
amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0)); amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
/* /*
...@@ -5378,33 +5392,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring) ...@@ -5378,33 +5392,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
amdgpu_ring_write(ring, 0); amdgpu_ring_write(ring, 0);
} }
static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring) static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
{ {
struct amdgpu_device *adev = ring->adev;
struct v9_ce_ib_state ce_payload = {0}; struct v9_ce_ib_state ce_payload = {0};
uint64_t csa_addr; uint64_t offset, ce_payload_gpu_addr;
void *ce_payload_cpu_addr;
int cnt; int cnt;
cnt = (sizeof(ce_payload) >> 2) + 4 - 2; cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
csa_addr = amdgpu_csa_vaddr(ring->adev);
if (ring->is_mes_queue) {
offset = offsetof(struct amdgpu_mes_ctx_meta_data,
gfx[0].gfx_meta_data) +
offsetof(struct v9_gfx_meta_data, ce_payload);
ce_payload_gpu_addr =
amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
ce_payload_cpu_addr =
amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
} else {
offset = offsetof(struct v9_gfx_meta_data, ce_payload);
ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
}
amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt)); amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) | amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
WRITE_DATA_DST_SEL(8) | WRITE_DATA_DST_SEL(8) |
WR_CONFIRM) | WR_CONFIRM) |
WRITE_DATA_CACHE_POLICY(0)); WRITE_DATA_CACHE_POLICY(0));
amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload))); amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload))); amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
if (resume)
amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
sizeof(ce_payload) >> 2);
else
amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
sizeof(ce_payload) >> 2);
} }
static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring) static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
{ {
int i, r = 0;
struct amdgpu_device *adev = ring->adev;
struct amdgpu_kiq *kiq = &adev->gfx.kiq;
struct amdgpu_ring *kiq_ring = &kiq->ring;
unsigned long flags;
if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
return -EINVAL;
spin_lock_irqsave(&kiq->ring_lock, flags);
if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
spin_unlock_irqrestore(&kiq->ring_lock, flags);
return -ENOMEM;
}
/* assert preemption condition */
amdgpu_ring_set_preempt_cond_exec(ring, false);
ring->trail_seq += 1;
amdgpu_ring_alloc(ring, 13);
gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
/*reset the CP_VMID_PREEMPT after trailing fence*/
amdgpu_ring_emit_wreg(ring,
SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
0x0);
/* assert IB preemption, emit the trailing fence */
kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
ring->trail_fence_gpu_addr,
ring->trail_seq);
amdgpu_ring_commit(kiq_ring);
spin_unlock_irqrestore(&kiq->ring_lock, flags);
/* poll the trailing fence */
for (i = 0; i < adev->usec_timeout; i++) {
if (ring->trail_seq ==
le32_to_cpu(*ring->trail_fence_cpu_addr))
break;
udelay(1);
}
if (i >= adev->usec_timeout) {
r = -EINVAL;
DRM_WARN("ring %d timeout to preempt ib\n", ring->idx);
}
amdgpu_ring_commit(ring);
/* deassert preemption condition */
amdgpu_ring_set_preempt_cond_exec(ring, true);
return r;
}
static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
{
struct amdgpu_device *adev = ring->adev;
struct v9_de_ib_state de_payload = {0}; struct v9_de_ib_state de_payload = {0};
uint64_t csa_addr, gds_addr; uint64_t offset, gds_addr, de_payload_gpu_addr;
void *de_payload_cpu_addr;
int cnt; int cnt;
csa_addr = amdgpu_csa_vaddr(ring->adev); if (ring->is_mes_queue) {
gds_addr = csa_addr + 4096; offset = offsetof(struct amdgpu_mes_ctx_meta_data,
gfx[0].gfx_meta_data) +
offsetof(struct v9_gfx_meta_data, de_payload);
de_payload_gpu_addr =
amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
de_payload_cpu_addr =
amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
offset = offsetof(struct amdgpu_mes_ctx_meta_data,
gfx[0].gds_backup) +
offsetof(struct v9_gfx_meta_data, de_payload);
gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
} else {
offset = offsetof(struct v9_gfx_meta_data, de_payload);
de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
AMDGPU_CSA_SIZE - adev->gds.gds_size,
PAGE_SIZE);
}
de_payload.gds_backup_addrlo = lower_32_bits(gds_addr); de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
de_payload.gds_backup_addrhi = upper_32_bits(gds_addr); de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
...@@ -5414,9 +5530,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring) ...@@ -5414,9 +5530,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
WRITE_DATA_DST_SEL(8) | WRITE_DATA_DST_SEL(8) |
WR_CONFIRM) | WR_CONFIRM) |
WRITE_DATA_CACHE_POLICY(0)); WRITE_DATA_CACHE_POLICY(0));
amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload))); amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload))); amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
if (resume)
amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
sizeof(de_payload) >> 2);
else
amdgpu_ring_write_multiple(ring, (void *)&de_payload,
sizeof(de_payload) >> 2);
} }
static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start, static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
...@@ -5432,8 +5554,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags) ...@@ -5432,8 +5554,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
{ {
uint32_t dw2 = 0; uint32_t dw2 = 0;
if (amdgpu_sriov_vf(ring->adev)) gfx_v9_0_ring_emit_ce_meta(ring,
gfx_v9_0_ring_emit_ce_meta(ring); (!amdgpu_sriov_vf(ring->adev) &&
flags & AMDGPU_IB_PREEMPTED) ? true : false);
dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */ dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
if (flags & AMDGPU_HAVE_CTX_SWITCH) { if (flags & AMDGPU_HAVE_CTX_SWITCH) {
...@@ -6760,6 +6883,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { ...@@ -6760,6 +6883,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl, .emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec, .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec, .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
.preempt_ib = gfx_v9_0_ring_preempt_ib,
.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl, .emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
.emit_wreg = gfx_v9_0_ring_emit_wreg, .emit_wreg = gfx_v9_0_ring_emit_wreg,
.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait, .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
......
...@@ -162,6 +162,7 @@ ...@@ -162,6 +162,7 @@
* 2 - Bypass * 2 - Bypass
*/ */
#define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21) #define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21)
#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30)
#define PACKET3_COPY_DATA 0x40 #define PACKET3_COPY_DATA 0x40
#define PACKET3_PFP_SYNC_ME 0x42 #define PACKET3_PFP_SYNC_ME 0x42
#define PACKET3_COND_WRITE 0x45 #define PACKET3_COND_WRITE 0x45
...@@ -184,6 +185,7 @@ ...@@ -184,6 +185,7 @@
#define EOP_TC_ACTION_EN (1 << 17) /* L2 */ #define EOP_TC_ACTION_EN (1 << 17) /* L2 */
#define EOP_TC_NC_ACTION_EN (1 << 19) #define EOP_TC_NC_ACTION_EN (1 << 19)
#define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */ #define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
#define EOP_EXEC (1 << 28) /* For Trailing Fence */
#define DATA_SEL(x) ((x) << 29) #define DATA_SEL(x) ((x) << 29)
/* 0 - discard /* 0 - discard
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment