Commit 02866679 authored by Chris Wilson's avatar Chris Wilson

drm/i915/selftests: Avoid repeatedly harming the same innocent context

We don't handle resetting the kernel context very well, or presumably any
context executing its breadcrumb commands in the ring as opposed to the
batchbuffer and flush. If we trigger a device reset twice in quick
succession while the kernel context is executing, we may end up skipping
the breadcrumb.  This is really only a problem for the selftest as
normally there is a large interlude between resets (hangcheck), or we
focus on resetting just one engine and so avoid repeatedly resetting
innocents.

Something to try would be a preempt-to-idle to quiesce the engine before
reset, so that innocent contexts would be spared the reset.
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
Reviewed-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180330131801.18327-1-chris@chris-wilson.co.uk
parent b651bd2a
...@@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915) ...@@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915)
int ret; int ret;
int i; int i;
GEM_TRACE("flags=%lx\n", error->flags);
might_sleep(); might_sleep();
lockdep_assert_held(&i915->drm.struct_mutex); lockdep_assert_held(&i915->drm.struct_mutex);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
...@@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) ...@@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
struct i915_request *active_request; struct i915_request *active_request;
int ret; int ret;
GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
active_request = i915_gem_reset_prepare_engine(engine); active_request = i915_gem_reset_prepare_engine(engine);
......
...@@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg) ...@@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg)
return err; return err;
} }
static int wait_for_others(struct drm_i915_private *i915,
struct intel_engine_cs *exclude)
{
struct intel_engine_cs *engine;
enum intel_engine_id id;
for_each_engine(engine, i915, id) {
if (engine == exclude)
continue;
if (wait_for(intel_engine_is_idle(engine), 10))
return -EIO;
}
return 0;
}
static int igt_reset_queue(void *arg) static int igt_reset_queue(void *arg)
{ {
struct drm_i915_private *i915 = arg; struct drm_i915_private *i915 = arg;
...@@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg) ...@@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg)
i915_request_get(rq); i915_request_get(rq);
__i915_request_add(rq, true); __i915_request_add(rq, true);
/*
* XXX We don't handle resetting the kernel context
* very well. If we trigger a device reset twice in
* quick succession while the kernel context is
* executing, we may end up skipping the breadcrumb.
* This is really only a problem for the selftest as
* normally there is a large interlude between resets
* (hangcheck), or we focus on resetting just one
* engine and so avoid repeatedly resetting innocents.
*/
err = wait_for_others(i915, engine);
if (err) {
pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
__func__, engine->name);
i915_request_put(rq);
i915_request_put(prev);
GEM_TRACE_DUMP();
i915_gem_set_wedged(i915);
goto fini;
}
if (!wait_for_hang(&h, prev)) { if (!wait_for_hang(&h, prev)) {
struct drm_printer p = drm_info_printer(i915->drm.dev); struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("%s: Failed to start request %x, at %x\n", pr_err("%s(%s): Failed to start request %x, at %x\n",
__func__, prev->fence.seqno, hws_seqno(&h, prev)); __func__, engine->name,
intel_engine_dump(prev->engine, &p, prev->fence.seqno, hws_seqno(&h, prev));
"%s\n", prev->engine->name); intel_engine_dump(engine, &p,
"%s\n", engine->name);
i915_request_put(rq); i915_request_put(rq);
i915_request_put(prev); i915_request_put(prev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment