Commit 81202807 authored by Dennis Li, committed by Alex Deucher

drm/amdgpu: block ring buffer access during GPU recovery

When the GPU is in reset, its state is not stable, and the ring buffer also
needs to be reset when resuming. Therefore the driver should protect the GPU
recovery thread from ring buffer accesses by other threads. Otherwise the GPU
will randomly hang during recovery.

v2: correct indent
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f6eb4339
...@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, ...@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
{ {
uint32_t ret; uint32_t ret;
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
return amdgpu_kiq_rreg(adev, reg); down_read_trylock(&adev->reset_sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
up_read(&adev->reset_sem);
return ret;
}
if ((reg * 4) < adev->rmmio_size) if ((reg * 4) < adev->rmmio_size)
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
...@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, ...@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
} }
trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
return ret; return ret;
} }
...@@ -409,8 +414,12 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, ...@@ -409,8 +414,12 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
uint32_t acc_flags) uint32_t acc_flags)
{ {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
return amdgpu_kiq_wreg(adev, reg, v); down_read_trylock(&adev->reset_sem)) {
amdgpu_kiq_wreg(adev, reg, v);
up_read(&adev->reset_sem);
return;
}
amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
} }
......
...@@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, ...@@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
*/ */
if (adev->gfx.kiq.ring.sched.ready && if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!amdgpu_in_reset(adev)) { down_read_trylock(&adev->reset_sem)) {
struct amdgpu_vmhub *hub = &adev->vmhub[vmhub]; struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
const unsigned eng = 17; const unsigned eng = 17;
u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type); u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
...@@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, ...@@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid); 1 << vmid);
up_read(&adev->reset_sem);
return; return;
} }
......
...@@ -503,13 +503,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, ...@@ -503,13 +503,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
* as GFXOFF under bare metal * as GFXOFF under bare metal
*/ */
if (adev->gfx.kiq.ring.sched.ready && if (adev->gfx.kiq.ring.sched.ready &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) && (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
!amdgpu_in_reset(adev)) { down_read_trylock(&adev->reset_sem)) {
uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng; uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req, amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid); 1 << vmid);
up_read(&adev->reset_sem);
return; return;
} }
...@@ -602,7 +603,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, ...@@ -602,7 +603,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
if (amdgpu_in_reset(adev)) if (amdgpu_in_reset(adev))
return -EIO; return -EIO;
if (ring->sched.ready) { if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
/* Vega20+XGMI caches PTEs in TC and TLB. Add a /* Vega20+XGMI caches PTEs in TC and TLB. Add a
* heavy-weight TLB flush (type 2), which flushes * heavy-weight TLB flush (type 2), which flushes
* both. Due to a race condition with concurrent * both. Due to a race condition with concurrent
...@@ -629,6 +630,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, ...@@ -629,6 +630,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
if (r) { if (r) {
amdgpu_ring_undo(ring); amdgpu_ring_undo(ring);
spin_unlock(&adev->gfx.kiq.ring_lock); spin_unlock(&adev->gfx.kiq.ring_lock);
up_read(&adev->reset_sem);
return -ETIME; return -ETIME;
} }
...@@ -637,9 +639,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev, ...@@ -637,9 +639,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout); r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
if (r < 1) { if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r); dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
up_read(&adev->reset_sem);
return -ETIME; return -ETIME;
} }
up_read(&adev->reset_sem);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment