Commit 8c185eca authored by Chris Wilson's avatar Chris Wilson

drm/i915: Split I915_RESET_IN_PROGRESS into two flags

I915_RESET_IN_PROGRESS is being used for both signaling the requirement
to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
to instruct a waiter (already holding the struct_mutex) to perform the
reset. To allow for a little more coordination, split these two meaning
into a couple of distinct flags. I915_RESET_BACKOFF tells
i915_mutex_lock_interruptible() not to acquire the mutex and
I915_RESET_HANDOFF tells the waiter to call i915_reset().
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Acked-by: default avatarMichel Thierry <michel.thierry@intel.com>
Reviewed-by: default avatarMika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170316171305.12972-1-chris@chris-wilson.co.uk
parent 3fc03069
...@@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) ...@@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
enum intel_engine_id id; enum intel_engine_id id;
if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
seq_printf(m, "Wedged\n"); seq_puts(m, "Wedged\n");
if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags)) if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
seq_printf(m, "Reset in progress\n"); seq_puts(m, "Reset in progress: struct_mutex backoff\n");
if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
seq_puts(m, "Reset in progress: reset handoff to waiter\n");
if (waitqueue_active(&dev_priv->gpu_error.wait_queue)) if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
seq_printf(m, "Waiter holding struct mutex\n"); seq_puts(m, "Waiter holding struct mutex\n");
if (waitqueue_active(&dev_priv->gpu_error.reset_queue)) if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
seq_printf(m, "struct_mutex blocked for reset\n"); seq_puts(m, "struct_mutex blocked for reset\n");
if (!i915.enable_hangcheck) { if (!i915.enable_hangcheck) {
seq_printf(m, "Hangcheck disabled\n"); seq_puts(m, "Hangcheck disabled\n");
return 0; return 0;
} }
...@@ -4127,7 +4129,7 @@ i915_wedged_set(void *data, u64 val) ...@@ -4127,7 +4129,7 @@ i915_wedged_set(void *data, u64 val)
* while it is writing to 'i915_wedged' * while it is writing to 'i915_wedged'
*/ */
if (i915_reset_in_progress(&dev_priv->gpu_error)) if (i915_reset_backoff(&dev_priv->gpu_error))
return -EAGAIN; return -EAGAIN;
i915_handle_error(dev_priv, val, i915_handle_error(dev_priv, val,
......
...@@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv) ...@@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
int ret; int ret;
lockdep_assert_held(&dev_priv->drm.struct_mutex); lockdep_assert_held(&dev_priv->drm.struct_mutex);
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags)) if (!test_bit(I915_RESET_HANDOFF, &error->flags))
return; return;
/* Clear any previous failed attempts at recovery. Time to try again. */ /* Clear any previous failed attempts at recovery. Time to try again. */
...@@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv) ...@@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
wakeup: wakeup:
i915_gem_reset_finish(dev_priv); i915_gem_reset_finish(dev_priv);
enable_irq(dev_priv->drm.irq); enable_irq(dev_priv->drm.irq);
wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
clear_bit(I915_RESET_HANDOFF, &error->flags);
wake_up_bit(&error->flags, I915_RESET_HANDOFF);
return; return;
error: error:
......
...@@ -1595,8 +1595,33 @@ struct i915_gpu_error { ...@@ -1595,8 +1595,33 @@ struct i915_gpu_error {
*/ */
unsigned long reset_count; unsigned long reset_count;
/**
* flags: Control various stages of the GPU reset
*
* #I915_RESET_BACKOFF - When we start a reset, we want to stop any
* other users acquiring the struct_mutex. To do this we set the
* #I915_RESET_BACKOFF bit in the error flags when we detect a reset
* and then check for that bit before acquiring the struct_mutex (in
* i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
* secondary role in preventing two concurrent global reset attempts.
*
* #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
* struct_mutex. We try to acquire the struct_mutex in the reset worker,
* but it may be held by some long running waiter (that we cannot
* interrupt without causing trouble). Once we are ready to do the GPU
* reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
* they already hold the struct_mutex and want to participate they can
* inspect the bit and do the reset directly, otherwise the worker
* waits for the struct_mutex.
*
* #I915_WEDGED - If reset fails and we can no longer use the GPU,
* we set the #I915_WEDGED bit. Prior to command submission, e.g.
* i915_gem_request_alloc(), this bit is checked and the sequence
* aborted (with -EIO reported to userspace) if set.
*/
unsigned long flags; unsigned long flags;
#define I915_RESET_IN_PROGRESS 0 #define I915_RESET_BACKOFF 0
#define I915_RESET_HANDOFF 1
#define I915_WEDGED (BITS_PER_LONG - 1) #define I915_WEDGED (BITS_PER_LONG - 1)
/** /**
...@@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine); ...@@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
void i915_gem_retire_requests(struct drm_i915_private *dev_priv); void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
static inline bool i915_reset_in_progress(struct i915_gpu_error *error) static inline bool i915_reset_backoff(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
}
static inline bool i915_reset_handoff(struct i915_gpu_error *error)
{ {
return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags)); return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
} }
static inline bool i915_terminally_wedged(struct i915_gpu_error *error) static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
...@@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error) ...@@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
return unlikely(test_bit(I915_WEDGED, &error->flags)); return unlikely(test_bit(I915_WEDGED, &error->flags));
} }
static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error) static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
{ {
return i915_reset_in_progress(error) | i915_terminally_wedged(error); return i915_reset_backoff(error) | i915_terminally_wedged(error);
} }
static inline u32 i915_reset_count(struct i915_gpu_error *error) static inline u32 i915_reset_count(struct i915_gpu_error *error)
......
...@@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error) ...@@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
might_sleep(); might_sleep();
if (!i915_reset_in_progress(error))
return 0;
/* /*
* Only wait 10 seconds for the gpu reset to complete to avoid hanging * Only wait 10 seconds for the gpu reset to complete to avoid hanging
* userspace. If it takes that long something really bad is going on and * userspace. If it takes that long something really bad is going on and
* we should simply try to bail out and fail as gracefully as possible. * we should simply try to bail out and fail as gracefully as possible.
*/ */
ret = wait_event_interruptible_timeout(error->reset_queue, ret = wait_event_interruptible_timeout(error->reset_queue,
!i915_reset_in_progress(error), !i915_reset_backoff(error),
I915_RESET_TIMEOUT); I915_RESET_TIMEOUT);
if (ret == 0) { if (ret == 0) {
DRM_ERROR("Timed out waiting for the gpu reset to complete\n"); DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
......
...@@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, ...@@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request) static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
{ {
if (likely(!i915_reset_in_progress(&request->i915->gpu_error))) if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
return false; return false;
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
......
...@@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg) ...@@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
return ret; return ret;
} }
static void i915_error_wake_up(struct drm_i915_private *dev_priv)
{
/*
* Notify all waiters for GPU completion events that reset state has
* been changed, and that they need to restart their wait after
* checking for potential errors (and bail out to drop locks if there is
* a gpu reset pending so that i915_error_work_func can acquire them).
*/
/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
wake_up_all(&dev_priv->gpu_error.wait_queue);
/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
wake_up_all(&dev_priv->pending_flip_queue);
}
/** /**
* i915_reset_and_wakeup - do process context error handling work * i915_reset_and_wakeup - do process context error handling work
* @dev_priv: i915 device private * @dev_priv: i915 device private
...@@ -2668,6 +2652,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) ...@@ -2668,6 +2652,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
intel_prepare_reset(dev_priv); intel_prepare_reset(dev_priv);
set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
wake_up_all(&dev_priv->gpu_error.wait_queue);
do { do {
/* /*
* All state reset _must_ be completed before we update the * All state reset _must_ be completed before we update the
...@@ -2682,7 +2669,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) ...@@ -2682,7 +2669,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
/* We need to wait for anyone holding the lock to wakeup */ /* We need to wait for anyone holding the lock to wakeup */
} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags, } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
I915_RESET_IN_PROGRESS, I915_RESET_HANDOFF,
TASK_UNINTERRUPTIBLE, TASK_UNINTERRUPTIBLE,
HZ)); HZ));
...@@ -2696,6 +2683,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) ...@@ -2696,6 +2683,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
* Note: The wake_up also serves as a memory barrier so that * Note: The wake_up also serves as a memory barrier so that
* waiters see the updated value of the dev_priv->gpu_error. * waiters see the updated value of the dev_priv->gpu_error.
*/ */
clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
wake_up_all(&dev_priv->gpu_error.reset_queue); wake_up_all(&dev_priv->gpu_error.reset_queue);
} }
...@@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
if (!engine_mask) if (!engine_mask)
goto out; goto out;
if (test_and_set_bit(I915_RESET_IN_PROGRESS, if (test_and_set_bit(I915_RESET_BACKOFF,
&dev_priv->gpu_error.flags)) &dev_priv->gpu_error.flags))
goto out; goto out;
/*
* Wakeup waiting processes so that the reset function
* i915_reset_and_wakeup doesn't deadlock trying to grab
* various locks. By bumping the reset counter first, the woken
* processes will see a reset in progress and back off,
* releasing their locks and then wait for the reset completion.
* We must do this for _all_ gpu waiters that might hold locks
* that the reset work needs to acquire.
*
* Note: The wake_up also provides a memory barrier to ensure that the
* waiters see the updated value of the reset flags.
*/
i915_error_wake_up(dev_priv);
i915_reset_and_wakeup(dev_priv); i915_reset_and_wakeup(dev_priv);
out: out:
......
...@@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc) ...@@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
{ {
struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error; struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
if (i915_reset_in_progress(error)) if (i915_reset_backoff(error))
return true; return true;
if (crtc->reset_count != i915_reset_count(error)) if (crtc->reset_count != i915_reset_count(error))
...@@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc, ...@@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
goto cleanup; goto cleanup;
intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error); intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) { if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
ret = -EIO; ret = -EIO;
goto unlock; goto unlock;
} }
......
...@@ -301,7 +301,8 @@ static int igt_global_reset(void *arg) ...@@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
/* Check that we can issue a global GPU reset */ /* Check that we can issue a global GPU reset */
set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags); set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex); mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error); reset_count = i915_reset_count(&i915->gpu_error);
...@@ -314,7 +315,8 @@ static int igt_global_reset(void *arg) ...@@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
} }
mutex_unlock(&i915->drm.struct_mutex); mutex_unlock(&i915->drm.struct_mutex);
GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags)); GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error)) if (i915_terminally_wedged(&i915->gpu_error))
err = -EIO; err = -EIO;
...@@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq) ...@@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
reset_count = i915_reset_count(&rq->i915->gpu_error); reset_count = i915_reset_count(&rq->i915->gpu_error);
set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags); set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
wake_up_all(&rq->i915->gpu_error.wait_queue); wake_up_all(&rq->i915->gpu_error.wait_queue);
return reset_count; return reset_count;
...@@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg) ...@@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
/* Check that we detect a stuck waiter and issue a reset */ /* Check that we detect a stuck waiter and issue a reset */
set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags); set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex); mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915); err = hang_init(&h, i915);
...@@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg) ...@@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
err = timeout; err = timeout;
goto out_rq; goto out_rq;
} }
GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) { if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n"); pr_err("No GPU reset recorded!\n");
err = -EINVAL; err = -EINVAL;
...@@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg) ...@@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
hang_fini(&h); hang_fini(&h);
unlock: unlock:
mutex_unlock(&i915->drm.struct_mutex); mutex_unlock(&i915->drm.struct_mutex);
clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error)) if (i915_terminally_wedged(&i915->gpu_error))
return -EIO; return -EIO;
...@@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg) ...@@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
if (!igt_can_mi_store_dword_imm(i915)) if (!igt_can_mi_store_dword_imm(i915))
return 0; return 0;
set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex); mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915); err = hang_init(&h, i915);
if (err) if (err)
...@@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg) ...@@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
i915_reset(i915); i915_reset(i915);
GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
&i915->gpu_error.flags)); &i915->gpu_error.flags));
if (prev->fence.error != -EIO) { if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
prev->fence.error); prev->fence.error);
...@@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg) ...@@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
hang_fini(&h); hang_fini(&h);
unlock: unlock:
mutex_unlock(&i915->drm.struct_mutex); mutex_unlock(&i915->drm.struct_mutex);
clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error)) if (i915_terminally_wedged(&i915->gpu_error))
return -EIO; return -EIO;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment