Commit 72eb16df authored by Chris Wilson

drm/i915: Serialise resets with wedging

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-6-chris@chris-wilson.co.uk
parent 15cbf007
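
The shape of the change: each operation is split into a __-prefixed helper that assumes the caller already holds wedge_mutex, plus a public wrapper that takes the mutex itself, and i915_reset_device() now takes wedge_mutex around the whole reset. Below is a minimal userspace sketch of that locking pattern, using a pthread mutex as a stand-in for wedge_mutex; the names and structure are illustrative only, not the driver's code.

/*
 * Illustrative userspace sketch (NOT the i915 code): the same
 * locked-wrapper / __unlocked-helper split the patch introduces.
 * The pthread mutex stands in for the kernel's wedge_mutex; all
 * names are made up and error handling is elided.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct gpu_error {
	pthread_mutex_t wedge_mutex;
	bool wedged;
};

/* Caller must hold wedge_mutex. */
static void __set_wedged(struct gpu_error *e)
{
	if (e->wedged)
		return;
	/* ... cancel outstanding requests, ban the engines ... */
	e->wedged = true;
}

/* Caller must hold wedge_mutex.  Returns true if recovery is possible. */
static bool __unset_wedged(struct gpu_error *e)
{
	/* ... wait for in-flight requests to complete ... */
	e->wedged = false;
	return true;
}

/* Public entry point: serialised against any concurrent reset. */
static void set_wedged(struct gpu_error *e)
{
	pthread_mutex_lock(&e->wedge_mutex);
	__set_wedged(e);
	pthread_mutex_unlock(&e->wedge_mutex);
}

/*
 * The reset path takes wedge_mutex once around the whole operation and
 * then calls only the unlocked helpers, so a concurrent set_wedged()
 * cannot interleave with an ongoing reset.
 */
static void reset_device(struct gpu_error *e)
{
	pthread_mutex_lock(&e->wedge_mutex);
	if (__unset_wedged(e)) {
		/* ... perform the actual reset ... */
	} else {
		__set_wedged(e); /* recovery failed: wedge terminally */
	}
	pthread_mutex_unlock(&e->wedge_mutex);
}

int main(void)
{
	struct gpu_error e = { PTHREAD_MUTEX_INITIALIZER, false };

	set_wedged(&e);    /* e.g. a fatal error path declares the GPU wedged */
	reset_device(&e);  /* a later reset clears the wedge under the same lock */
	printf("wedged: %s\n", e.wedged ? "yes" : "no");
	return 0;
}

This mirrors why the patch below has i915_reset() call __i915_gem_unset_wedged()/__i915_gem_set_wedged(): its caller, i915_reset_device(), already holds wedge_mutex, so calling the locked wrappers there would deadlock on a second mutex_lock().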
@@ -794,17 +794,14 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags))
 		return;
-	}
 
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
@@ -853,12 +850,18 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	__i915_gem_set_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -869,8 +872,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -921,11 +922,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
 
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -975,7 +986,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1037,7 +1048,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1129,7 +1140,9 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
+		mutex_lock(&error->wedge_mutex);
 		i915_reset(i915, engine_mask, reason);
+		mutex_unlock(&error->wedge_mutex);
 
 		intel_finish_reset(i915);
 	}
@@ -1197,6 +1210,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1233,20 +1247,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1255,10 +1268,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1268,8 +1280,8 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
@@ -1278,11 +1290,11 @@ void i915_handle_error(struct drm_i915_private *i915,
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);