Commit 05535726 authored by Chris Wilson's avatar Chris Wilson

drm/i915: Delay queuing hangcheck to wait-request

We can forgo queuing the hangcheck from the start of every request to
until we wait upon a request. This reduces the overhead of every
request, but may increase the latency of detecting a hang. However, if
nothing every waits upon a hang, did it ever hang? It also improves the
robustness of the wait-request by ensuring that the hangchecker is
indeed running before we sleep indefinitely (and thereby ensuring that
we never actually sleep forever waiting for a dead GPU).

As pointed out by Tvrtko, it is possible for a GPU hang to go unnoticed
for as long as nobody is waiting for the GPU. Though this rare, during
that time we may be consuming more power than if we had promptly
recovered, and in the most extreme case we may exhaust all memory before
forcing the hangcheck. Something to be wary off in future.
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-2-git-send-email-chris@chris-wilson.co.uk
parent bed50aea
...@@ -1532,6 +1532,15 @@ int __i915_wait_request(struct drm_i915_gem_request *req, ...@@ -1532,6 +1532,15 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
break; break;
} }
/* Ensure that even if the GPU hangs, we get woken up.
*
* However, note that if no one is waiting, we never notice
* a gpu hang. Eventually, we will have to wait for a resource
* held by the GPU and so trigger a hangcheck. In the most
* pathological case, this will be upon memory starvation!
*/
i915_queue_hangcheck(dev_priv);
timer.function = NULL; timer.function = NULL;
if (timeout || missed_irq(dev_priv, engine)) { if (timeout || missed_irq(dev_priv, engine)) {
unsigned long expire; unsigned long expire;
...@@ -2919,8 +2928,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, ...@@ -2919,8 +2928,6 @@ void __i915_add_request(struct drm_i915_gem_request *request,
/* Not allowed to fail! */ /* Not allowed to fail! */
WARN(ret, "emit|add_request failed: %d!\n", ret); WARN(ret, "emit|add_request failed: %d!\n", ret);
i915_queue_hangcheck(engine->i915);
queue_delayed_work(dev_priv->wq, queue_delayed_work(dev_priv->wq,
&dev_priv->mm.retire_work, &dev_priv->mm.retire_work,
round_jiffies_up_relative(HZ)); round_jiffies_up_relative(HZ));
......
...@@ -3135,10 +3135,10 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3135,10 +3135,10 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv); intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine_id(engine, dev_priv, id) { for_each_engine_id(engine, dev_priv, id) {
bool busy = waitqueue_active(&engine->irq_queue);
u64 acthd; u64 acthd;
u32 seqno; u32 seqno;
unsigned user_interrupts; unsigned user_interrupts;
bool busy = true;
semaphore_clear_deadlocks(dev_priv); semaphore_clear_deadlocks(dev_priv);
...@@ -3161,12 +3161,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3161,12 +3161,11 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (engine->hangcheck.seqno == seqno) { if (engine->hangcheck.seqno == seqno) {
if (ring_idle(engine, seqno)) { if (ring_idle(engine, seqno)) {
engine->hangcheck.action = HANGCHECK_IDLE; engine->hangcheck.action = HANGCHECK_IDLE;
if (waitqueue_active(&engine->irq_queue)) { if (busy) {
/* Safeguard against driver failure */ /* Safeguard against driver failure */
user_interrupts = kick_waiters(engine); user_interrupts = kick_waiters(engine);
engine->hangcheck.score += BUSY; engine->hangcheck.score += BUSY;
} else }
busy = false;
} else { } else {
/* We always increment the hangcheck score /* We always increment the hangcheck score
* if the ring is busy and still processing * if the ring is busy and still processing
...@@ -3240,9 +3239,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -3240,9 +3239,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
goto out; goto out;
} }
/* Reset timer in case GPU hangs without another request being added */
if (busy_count) if (busy_count)
/* Reset timer case chip hangs without another request
* being added */
i915_queue_hangcheck(dev_priv); i915_queue_hangcheck(dev_priv);
out: out:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment