Commit 3c2d6ea2, authored by Philip Yang, committed by Alex Deucher

drm/amdgpu: handle IH ring1 overflow

IH ring1 is used to process GPU retry faults. Overflow is enabled on this
ring to drain retry faults, because we want to receive other interrupts
while handling a retry fault to recover a range. No overflow flag is set
when wptr passes rptr, so use the timestamps at rptr and wptr to handle
overflow and to drain retry faults.

If a fault's timestamp goes backward, the fault is filtered and should not
be processed. Draining is finished once processed_timestamp is equal to or
later than the checkpoint timestamp.

Add a decode_iv_ts callback to the amdgpu_ih_funcs interface so that
different chips can get the timestamp from an IV entry with a different IV
size and timestamp offset. amdgpu_ih_decode_iv_ts_helper is used for vega10,
vega20 and navi10.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 232d1d43
...@@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid) ...@@ -350,6 +350,7 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
* amdgpu_gmc_filter_faults - filter VM faults * amdgpu_gmc_filter_faults - filter VM faults
* *
* @adev: amdgpu device structure * @adev: amdgpu device structure
* @ih: interrupt ring that the fault received from
* @addr: address of the VM fault * @addr: address of the VM fault
* @pasid: PASID of the process causing the fault * @pasid: PASID of the process causing the fault
* @timestamp: timestamp of the fault * @timestamp: timestamp of the fault
...@@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid) ...@@ -358,7 +359,8 @@ static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t pasid)
* True if the fault was filtered and should not be processed further. * True if the fault was filtered and should not be processed further.
* False if the fault is a new one and needs to be handled. * False if the fault is a new one and needs to be handled.
*/ */
bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih, uint64_t addr,
uint16_t pasid, uint64_t timestamp) uint16_t pasid, uint64_t timestamp)
{ {
struct amdgpu_gmc *gmc = &adev->gmc; struct amdgpu_gmc *gmc = &adev->gmc;
...@@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, ...@@ -366,6 +368,10 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr,
struct amdgpu_gmc_fault *fault; struct amdgpu_gmc_fault *fault;
uint32_t hash; uint32_t hash;
/* Stale retry fault if timestamp goes backward */
if (amdgpu_ih_ts_after(timestamp, ih->processed_timestamp))
return true;
/* If we don't have space left in the ring buffer return immediately */ /* If we don't have space left in the ring buffer return immediately */
stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) - stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
AMDGPU_GMC_FAULT_TIMEOUT; AMDGPU_GMC_FAULT_TIMEOUT;
......
...@@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev, ...@@ -316,7 +316,8 @@ void amdgpu_gmc_gart_location(struct amdgpu_device *adev,
struct amdgpu_gmc *mc); struct amdgpu_gmc *mc);
void amdgpu_gmc_agp_location(struct amdgpu_device *adev, void amdgpu_gmc_agp_location(struct amdgpu_device *adev,
struct amdgpu_gmc *mc); struct amdgpu_gmc *mc);
bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t addr, bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih, uint64_t addr,
uint16_t pasid, uint64_t timestamp); uint16_t pasid, uint64_t timestamp);
void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
uint16_t pasid); uint16_t pasid);
......
...@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv, ...@@ -164,52 +164,32 @@ void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
} }
} }
/*
 * Waiter helper that checks whether the current rptr matches or has passed the
 * checkpoint wptr.
 *
 * @checkpoint_wptr is compared against an rptr value that is allowed to grow
 * beyond ptr_mask. @prev_rptr carries that extended rptr from one call to the
 * next and is updated on every call.
 *
 * Returns true when the extended rptr has reached @checkpoint_wptr, or when
 * the masked rptr equals the current wptr.
 */
static bool amdgpu_ih_has_checkpoint_processed(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih,
uint32_t checkpoint_wptr,
uint32_t *prev_rptr)
{
/* OR ih->rptr with the bits of *prev_rptr that lie above ptr_mask. */
uint32_t cur_rptr = ih->rptr | (*prev_rptr & ~ih->ptr_mask);
/* rptr has wrapped: advance the extended value by ptr_mask + 1. */
if (cur_rptr < *prev_rptr)
cur_rptr += ih->ptr_mask + 1;
*prev_rptr = cur_rptr;
/* check ring is empty to workaround missing wptr overflow flag */
return cur_rptr >= checkpoint_wptr ||
(cur_rptr & ih->ptr_mask) == amdgpu_ih_get_wptr(adev, ih);
}
/** /**
* amdgpu_ih_wait_on_checkpoint_process - wait to process IVs up to checkpoint * amdgpu_ih_wait_on_checkpoint_process_ts - wait to process IVs up to checkpoint
* *
* @adev: amdgpu_device pointer * @adev: amdgpu_device pointer
* @ih: ih ring to process * @ih: ih ring to process
* *
* Used to ensure ring has processed IVs up to the checkpoint write pointer. * Used to ensure ring has processed IVs up to the checkpoint write pointer.
*/ */
int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih) struct amdgpu_ih_ring *ih)
{ {
uint32_t checkpoint_wptr, rptr; uint32_t checkpoint_wptr;
uint64_t checkpoint_ts;
long timeout = HZ;
if (!ih->enabled || adev->shutdown) if (!ih->enabled || adev->shutdown)
return -ENODEV; return -ENODEV;
checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih); checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
/* Order wptr with rptr. */ /* Order wptr with ring data. */
rmb(); rmb();
rptr = READ_ONCE(ih->rptr); checkpoint_ts = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
/* wptr has wrapped. */ return wait_event_interruptible_timeout(ih->wait_process,
if (rptr > checkpoint_wptr) !amdgpu_ih_ts_after(ih->processed_timestamp, checkpoint_ts),
checkpoint_wptr += ih->ptr_mask + 1; timeout);
return wait_event_interruptible(ih->wait_process,
amdgpu_ih_has_checkpoint_processed(adev, ih,
checkpoint_wptr, &rptr));
} }
/** /**
...@@ -299,3 +279,18 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev, ...@@ -299,3 +279,18 @@ void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
/* wptr/rptr are in bytes! */ /* wptr/rptr are in bytes! */
ih->rptr += 32; ih->rptr += 32;
} }
/**
 * amdgpu_ih_decode_iv_ts_helper - read the timestamp stored in an IV entry
 *
 * @ih: ih ring to read from
 * @rptr: byte position in the ring used as the reference entry
 * @offset: signed number of 32-byte entries to step from @rptr
 *
 * Locates the entry @offset entries away from @rptr (wrapped with ptr_mask)
 * and assembles a 48-bit value from dword 1 (low 32 bits) and the low 16 bits
 * of dword 2 (high bits) of that entry.
 */
uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
signed int offset)
{
	const uint32_t entry_bytes = 32;
	uint32_t dw_idx;
	uint64_t ts_lo, ts_hi;

	/* Unsigned arithmetic: a negative offset wraps and is masked below. */
	rptr += entry_bytes * offset;
	/* rptr is in bytes, the ring is indexed in dwords. */
	dw_idx = (rptr & ih->ptr_mask) >> 2;

	ts_lo = le32_to_cpu(ih->ring[dw_idx + 1]);
	ts_hi = le32_to_cpu(ih->ring[dw_idx + 2]) & 0xffff;

	return ts_lo | (ts_hi << 32);
}
...@@ -68,20 +68,30 @@ struct amdgpu_ih_ring { ...@@ -68,20 +68,30 @@ struct amdgpu_ih_ring {
/* For waiting on IH processing at checkpoint. */ /* For waiting on IH processing at checkpoint. */
wait_queue_head_t wait_process; wait_queue_head_t wait_process;
uint64_t processed_timestamp;
}; };
/*
 * Return true if timestamp t2 is after t1, allowing for 48-bit wrap around.
 *
 * Both operands are widened to 64 bits and shifted left by 16 so that bit 47
 * of the timestamp lands in the sign position. The subtraction is done in
 * unsigned arithmetic, where wrap around is well defined, and only the final
 * difference is interpreted as signed; subtracting two values that were
 * already converted to int64_t could overflow.
 */
#define amdgpu_ih_ts_after(t1, t2) \
	((int64_t)(((uint64_t)(t2) << 16) - ((uint64_t)(t1) << 16)) > 0LL)
/* provided by the ih block */ /* provided by the ih block */
struct amdgpu_ih_funcs { struct amdgpu_ih_funcs {
/* ring read/write ptr handling, called from interrupt context */ /* ring read/write ptr handling, called from interrupt context */
u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); u32 (*get_wptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, void (*decode_iv)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry);
uint64_t (*decode_iv_ts)(struct amdgpu_ih_ring *ih, u32 rptr,
signed int offset);
void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); void (*set_rptr)(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
}; };
#define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih)) #define amdgpu_ih_get_wptr(adev, ih) (adev)->irq.ih_funcs->get_wptr((adev), (ih))
#define amdgpu_ih_decode_iv(adev, iv) \ #define amdgpu_ih_decode_iv(adev, iv) \
(adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv)) (adev)->irq.ih_funcs->decode_iv((adev), (ih), (iv))
/*
 * Decode a timestamp from the ring through the IH block's decode_iv_ts
 * callback, passing ih, rptr and offset through unchanged. Evaluates to 0,
 * via the WARN_ON_ONCE() branch, when the callback pointer is NULL.
 */
#define amdgpu_ih_decode_iv_ts(adev, ih, rptr, offset) \
(WARN_ON_ONCE(!(adev)->irq.ih_funcs->decode_iv_ts) ? 0 : \
(adev)->irq.ih_funcs->decode_iv_ts((ih), (rptr), (offset)))
#define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih)) #define amdgpu_ih_set_rptr(adev, ih) (adev)->irq.ih_funcs->set_rptr((adev), (ih))
int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
...@@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih, ...@@ -89,10 +99,12 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih,
void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv, void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const uint32_t *iv,
unsigned int num_dw); unsigned int num_dw);
int amdgpu_ih_wait_on_checkpoint_process(struct amdgpu_device *adev, int amdgpu_ih_wait_on_checkpoint_process_ts(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih); struct amdgpu_ih_ring *ih);
int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih); int amdgpu_ih_process(struct amdgpu_device *adev, struct amdgpu_ih_ring *ih);
void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev, void amdgpu_ih_decode_iv_helper(struct amdgpu_device *adev,
struct amdgpu_ih_ring *ih, struct amdgpu_ih_ring *ih,
struct amdgpu_iv_entry *entry); struct amdgpu_iv_entry *entry);
uint64_t amdgpu_ih_decode_iv_ts_helper(struct amdgpu_ih_ring *ih, u32 rptr,
signed int offset);
#endif #endif
...@@ -528,6 +528,12 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev, ...@@ -528,6 +528,12 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev,
/* Send it to amdkfd as well if it isn't already handled */ /* Send it to amdkfd as well if it isn't already handled */
if (!handled) if (!handled)
amdgpu_amdkfd_interrupt(adev, entry.iv_entry); amdgpu_amdkfd_interrupt(adev, entry.iv_entry);
dev_WARN_ONCE(adev->dev, ih->processed_timestamp == entry.timestamp,
"IH timestamps are not unique");
if (amdgpu_ih_ts_after(ih->processed_timestamp, entry.timestamp))
ih->processed_timestamp = entry.timestamp;
} }
/** /**
......
...@@ -107,7 +107,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev, ...@@ -107,7 +107,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
/* Process it onyl if it's the first fault for this address */ /* Process it onyl if it's the first fault for this address */
if (entry->ih != &adev->irq.ih_soft && if (entry->ih != &adev->irq.ih_soft &&
amdgpu_gmc_filter_faults(adev, addr, entry->pasid, amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
entry->timestamp)) entry->timestamp))
return 1; return 1;
......
...@@ -523,7 +523,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev, ...@@ -523,7 +523,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
/* Process it onyl if it's the first fault for this address */ /* Process it onyl if it's the first fault for this address */
if (entry->ih != &adev->irq.ih_soft && if (entry->ih != &adev->irq.ih_soft &&
amdgpu_gmc_filter_faults(adev, addr, entry->pasid, amdgpu_gmc_filter_faults(adev, entry->ih, addr, entry->pasid,
entry->timestamp)) entry->timestamp))
return 1; return 1;
......
...@@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = { ...@@ -716,6 +716,7 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = {
static const struct amdgpu_ih_funcs navi10_ih_funcs = { static const struct amdgpu_ih_funcs navi10_ih_funcs = {
.get_wptr = navi10_ih_get_wptr, .get_wptr = navi10_ih_get_wptr,
.decode_iv = amdgpu_ih_decode_iv_helper, .decode_iv = amdgpu_ih_decode_iv_helper,
.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
.set_rptr = navi10_ih_set_rptr .set_rptr = navi10_ih_set_rptr
}; };
......
...@@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = { ...@@ -640,6 +640,7 @@ const struct amd_ip_funcs vega10_ih_ip_funcs = {
static const struct amdgpu_ih_funcs vega10_ih_funcs = { static const struct amdgpu_ih_funcs vega10_ih_funcs = {
.get_wptr = vega10_ih_get_wptr, .get_wptr = vega10_ih_get_wptr,
.decode_iv = amdgpu_ih_decode_iv_helper, .decode_iv = amdgpu_ih_decode_iv_helper,
.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
.set_rptr = vega10_ih_set_rptr .set_rptr = vega10_ih_set_rptr
}; };
......
...@@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = { ...@@ -688,6 +688,7 @@ const struct amd_ip_funcs vega20_ih_ip_funcs = {
static const struct amdgpu_ih_funcs vega20_ih_funcs = { static const struct amdgpu_ih_funcs vega20_ih_funcs = {
.get_wptr = vega20_ih_get_wptr, .get_wptr = vega20_ih_get_wptr,
.decode_iv = amdgpu_ih_decode_iv_helper, .decode_iv = amdgpu_ih_decode_iv_helper,
.decode_iv_ts = amdgpu_ih_decode_iv_ts_helper,
.set_rptr = vega20_ih_set_rptr .set_rptr = vega20_ih_set_rptr
}; };
......
...@@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms) ...@@ -1974,7 +1974,7 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
pr_debug("drain retry fault gpu %d svms %p\n", i, svms); pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
amdgpu_ih_wait_on_checkpoint_process(pdd->dev->adev, amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
&pdd->dev->adev->irq.ih1); &pdd->dev->adev->irq.ih1);
pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms); pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment