Commit 372fbb8e authored by Chris Wilson, committed by Daniel Vetter

drm/i915: Decouple GPU error reporting from ring initialisation

Currently we report through our error state only the rings that have
been initialised (as detected by ring->obj). This check is done after
the GPU reset and ring re-initialisation, which means that the software
state may not be the same as when we captured the hardware error and we
may not print out any of the vital information for debugging the hang.

This (and the implied object leak) is a regression from

commit 3d57e5bd
Author: Ben Widawsky <ben@bwidawsk.net>
Date:   Mon Oct 14 10:01:36 2013 -0700

    drm/i915: Do a fuller init after reset

Note that we are already starting to get bug reports with incomplete
error states from 3.13, which also hampers debugging userspace driver
issues.

v2: Prevent a NULL dereference on 830gm/845g after a GPU reset where
    the scratch obj may be NULL.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
References: https://bugs.freedesktop.org/show_bug.cgi?id=74094
Cc: stable@vger.kernel.org # please don't delay since it's a
vital support/debug feature for the intel gfx stack in general
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
[danvet: Add a bit of fluff to make it clear we need this expedited in
stable.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
parent 22accca0
...@@ -330,6 +330,7 @@ struct drm_i915_error_state { ...@@ -330,6 +330,7 @@ struct drm_i915_error_state {
u64 fence[I915_MAX_NUM_FENCES]; u64 fence[I915_MAX_NUM_FENCES];
struct timeval time; struct timeval time;
struct drm_i915_error_ring { struct drm_i915_error_ring {
bool valid;
struct drm_i915_error_object { struct drm_i915_error_object {
int page_count; int page_count;
u32 gtt_offset; u32 gtt_offset;
......
...@@ -239,6 +239,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m, ...@@ -239,6 +239,9 @@ static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
unsigned ring) unsigned ring)
{ {
BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */ BUG_ON(ring >= I915_NUM_RINGS); /* shut up confused gcc */
if (!error->ring[ring].valid)
return;
err_printf(m, "%s command stream:\n", ring_str(ring)); err_printf(m, "%s command stream:\n", ring_str(ring));
err_printf(m, " HEAD: 0x%08x\n", error->head[ring]); err_printf(m, " HEAD: 0x%08x\n", error->head[ring]);
err_printf(m, " TAIL: 0x%08x\n", error->tail[ring]); err_printf(m, " TAIL: 0x%08x\n", error->tail[ring]);
...@@ -293,7 +296,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -293,7 +296,6 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
struct drm_device *dev = error_priv->dev; struct drm_device *dev = error_priv->dev;
drm_i915_private_t *dev_priv = dev->dev_private; drm_i915_private_t *dev_priv = dev->dev_private;
struct drm_i915_error_state *error = error_priv->error; struct drm_i915_error_state *error = error_priv->error;
struct intel_ring_buffer *ring;
int i, j, page, offset, elt; int i, j, page, offset, elt;
if (!error) { if (!error) {
...@@ -328,7 +330,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -328,7 +330,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
if (INTEL_INFO(dev)->gen == 7) if (INTEL_INFO(dev)->gen == 7)
err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
for_each_ring(ring, dev_priv, i) for (i = 0; i < ARRAY_SIZE(error->ring); i++)
i915_ring_error_state(m, dev, error, i); i915_ring_error_state(m, dev, error, i);
if (error->active_bo) if (error->active_bo)
...@@ -385,8 +387,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -385,8 +387,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
} }
} }
obj = error->ring[i].ctx; if ((obj = error->ring[i].ctx)) {
if (obj) {
err_printf(m, "%s --- HW Context = 0x%08x\n", err_printf(m, "%s --- HW Context = 0x%08x\n",
dev_priv->ring[i].name, dev_priv->ring[i].name,
obj->gtt_offset); obj->gtt_offset);
...@@ -667,7 +668,8 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv, ...@@ -667,7 +668,8 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
return NULL; return NULL;
obj = ring->scratch.obj; obj = ring->scratch.obj;
if (acthd >= i915_gem_obj_ggtt_offset(obj) && if (obj != NULL &&
acthd >= i915_gem_obj_ggtt_offset(obj) &&
acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size) acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
return i915_error_object_create(dev_priv, obj); return i915_error_object_create(dev_priv, obj);
} }
...@@ -775,11 +777,17 @@ static void i915_gem_record_rings(struct drm_device *dev, ...@@ -775,11 +777,17 @@ static void i915_gem_record_rings(struct drm_device *dev,
struct drm_i915_error_state *error) struct drm_i915_error_state *error)
{ {
struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_private *dev_priv = dev->dev_private;
struct intel_ring_buffer *ring;
struct drm_i915_gem_request *request; struct drm_i915_gem_request *request;
int i, count; int i, count;
for_each_ring(ring, dev_priv, i) { for (i = 0; i < I915_NUM_RINGS; i++) {
struct intel_ring_buffer *ring = &dev_priv->ring[i];
if (ring->dev == NULL)
continue;
error->ring[i].valid = true;
i915_record_ring_state(dev, error, ring); i915_record_ring_state(dev, error, ring);
error->ring[i].batchbuffer = error->ring[i].batchbuffer =
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment