Commit e275d61c authored by Matthew Brost

drm/xe/guc: Handle timing out of signaled jobs gracefully

Timing out of signaled jobs can happen during regular operations (e.g.
an exec queue closed immediately after the last fence signaled). The TDR can
fire before the worker which frees jobs. Rather than running through the TDR
if a signaled job is found, simply free it without any debug messages.

Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reported-by: José Roberto de Souza <jose.souza@intel.com>
Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/1271
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Tested-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: José Roberto de Souza <jose.souza@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240223204659.40750-1-matthew.brost@intel.com
parent ba6bbdc6
...@@ -929,7 +929,16 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) ...@@ -929,7 +929,16 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
int err = -ETIME; int err = -ETIME;
int i = 0; int i = 0;
if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) { /*
* TDR has fired before free job worker. Common if exec queue
* immediately closed after last fence signaled.
*/
if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
guc_exec_queue_free_job(drm_job);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx", drm_notice(&xe->drm, "Timedout job: seqno=%u, guc_id=%d, flags=0x%lx",
xe_sched_job_seqno(job), q->guc->id, q->flags); xe_sched_job_seqno(job), q->guc->id, q->flags);
xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL, xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
...@@ -939,10 +948,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) ...@@ -939,10 +948,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
simple_error_capture(q); simple_error_capture(q);
xe_devcoredump(job); xe_devcoredump(job);
} else {
drm_dbg(&xe->drm, "Timedout signaled job: seqno=%u, guc_id=%d, flags=0x%lx",
xe_sched_job_seqno(job), q->guc->id, q->flags);
}
trace_xe_sched_job_timedout(job); trace_xe_sched_job_timedout(job);
/* Kill the run_job entry point */ /* Kill the run_job entry point */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment