Commit 12471ba8 authored by Chris Wilson

drm/i915: Harden detection of missed interrupts

Only declare a missed interrupt if we find that the GPU is idle with
waiters and a hangcheck interval has passed in which no new user
interrupts have been raised.

v2: Clear the stuck interrupt marker between successful batches
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1460195877-20520-3-git-send-email-chris@chris-wilson.co.uk
parent c04e0f3b
...@@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data) ...@@ -728,10 +728,10 @@ static int i915_gem_request_info(struct seq_file *m, void *data)
static void i915_ring_seqno_info(struct seq_file *m, static void i915_ring_seqno_info(struct seq_file *m,
struct intel_engine_cs *engine) struct intel_engine_cs *engine)
{ {
if (engine->get_seqno) { seq_printf(m, "Current sequence (%s): %x\n",
seq_printf(m, "Current sequence (%s): %x\n", engine->name, engine->get_seqno(engine));
engine->name, engine->get_seqno(engine)); seq_printf(m, "Current user interrupts (%s): %x\n",
} engine->name, READ_ONCE(engine->user_interrupts));
} }
static int i915_gem_seqno_info(struct seq_file *m, void *data) static int i915_gem_seqno_info(struct seq_file *m, void *data)
...@@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) ...@@ -1367,6 +1367,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
engine->hangcheck.seqno, engine->hangcheck.seqno,
seqno[id], seqno[id],
engine->last_submitted_seqno); engine->last_submitted_seqno);
seq_printf(m, "\tuser interrupts = %x [current %x]\n",
engine->hangcheck.user_interrupts,
READ_ONCE(engine->user_interrupts));
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)engine->hangcheck.acthd, (long long)engine->hangcheck.acthd,
(long long)acthd[id]); (long long)acthd[id]);
......
...@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine) ...@@ -1000,6 +1000,7 @@ static void notify_ring(struct intel_engine_cs *engine)
return; return;
trace_i915_gem_request_notify(engine); trace_i915_gem_request_notify(engine);
engine->user_interrupts++;
wake_up_all(&engine->irq_queue); wake_up_all(&engine->irq_queue);
} }
...@@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd) ...@@ -3054,6 +3055,24 @@ ring_stuck(struct intel_engine_cs *engine, u64 acthd)
return HANGCHECK_HUNG; return HANGCHECK_HUNG;
} }
/*
 * Sample the engine's user-interrupt counter and compare it with the value
 * stored in engine->hangcheck.user_interrupts. If the two match and this call
 * is the one that sets the engine's bit in gpu_error.missed_irq_rings, log the
 * event and wake everything sleeping on the engine's irq_queue.
 *
 * If the counter differs from the stored value, or the missed_irq_rings bit
 * was already set, nothing is logged and nobody is woken.
 *
 * Returns the sampled counter value in every case.
 */
static unsigned kick_waiters(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = to_i915(engine->dev);
	unsigned irq_count = READ_ONCE(engine->user_interrupts);

	/* Counter differs from the stored hangcheck value: do not kick. */
	if (engine->hangcheck.user_interrupts != irq_count)
		return irq_count;

	/* Bit was already set for this engine: report and wake only once. */
	if (test_and_set_bit(engine->id, &i915->gpu_error.missed_irq_rings))
		return irq_count;

	/* Engines flagged in test_irq_rings get the "fake" info message. */
	if (i915->gpu_error.test_irq_rings & intel_engine_flag(engine)) {
		DRM_INFO("Fake missed irq on %s\n",
			 engine->name);
	} else {
		DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
			  engine->name);
	}

	wake_up_all(&engine->irq_queue);

	return irq_count;
}
/* /*
* This is called when the chip hasn't reported back with completed * This is called when the chip hasn't reported back with completed
* batchbuffers in a long time. We keep track per ring seqno progress and * batchbuffers in a long time. We keep track per ring seqno progress and
...@@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3096,6 +3115,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
for_each_engine_id(engine, dev_priv, id) { for_each_engine_id(engine, dev_priv, id) {
u64 acthd; u64 acthd;
u32 seqno; u32 seqno;
unsigned user_interrupts;
bool busy = true; bool busy = true;
semaphore_clear_deadlocks(dev_priv); semaphore_clear_deadlocks(dev_priv);
...@@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3113,22 +3133,15 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
acthd = intel_ring_get_active_head(engine); acthd = intel_ring_get_active_head(engine);
seqno = engine->get_seqno(engine); seqno = engine->get_seqno(engine);
/* Reset stuck interrupts between batch advances */
user_interrupts = 0;
if (engine->hangcheck.seqno == seqno) { if (engine->hangcheck.seqno == seqno) {
if (ring_idle(engine, seqno)) { if (ring_idle(engine, seqno)) {
engine->hangcheck.action = HANGCHECK_IDLE; engine->hangcheck.action = HANGCHECK_IDLE;
if (waitqueue_active(&engine->irq_queue)) { if (waitqueue_active(&engine->irq_queue)) {
/* Issue a wake-up to catch stuck h/w. */
if (!test_and_set_bit(engine->id, &dev_priv->gpu_error.missed_irq_rings)) {
if (!(dev_priv->gpu_error.test_irq_rings & intel_engine_flag(engine)))
DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
engine->name);
else
DRM_INFO("Fake missed irq on %s\n",
engine->name);
wake_up_all(&engine->irq_queue);
}
/* Safeguard against driver failure */ /* Safeguard against driver failure */
user_interrupts = kick_waiters(engine);
engine->hangcheck.score += BUSY; engine->hangcheck.score += BUSY;
} else } else
busy = false; busy = false;
...@@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3179,7 +3192,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
engine->hangcheck.score = 0; engine->hangcheck.score = 0;
/* Clear head and subunit states on seqno movement */ /* Clear head and subunit states on seqno movement */
engine->hangcheck.acthd = 0; acthd = 0;
memset(engine->hangcheck.instdone, 0, memset(engine->hangcheck.instdone, 0,
sizeof(engine->hangcheck.instdone)); sizeof(engine->hangcheck.instdone));
...@@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3187,6 +3200,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
engine->hangcheck.seqno = seqno; engine->hangcheck.seqno = seqno;
engine->hangcheck.acthd = acthd; engine->hangcheck.acthd = acthd;
engine->hangcheck.user_interrupts = user_interrupts;
busy_count += busy; busy_count += busy;
} }
......
...@@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action { ...@@ -87,6 +87,7 @@ enum intel_ring_hangcheck_action {
struct intel_ring_hangcheck { struct intel_ring_hangcheck {
u64 acthd; u64 acthd;
u32 seqno; u32 seqno;
unsigned user_interrupts;
int score; int score;
enum intel_ring_hangcheck_action action; enum intel_ring_hangcheck_action action;
int deadlock; int deadlock;
...@@ -305,6 +306,7 @@ struct intel_engine_cs { ...@@ -305,6 +306,7 @@ struct intel_engine_cs {
* inspecting request list. * inspecting request list.
*/ */
u32 last_submitted_seqno; u32 last_submitted_seqno;
unsigned user_interrupts;
bool gpu_caches_dirty; bool gpu_caches_dirty;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment