Commit 4fa6053e authored by Chris Wilson

drm/i915: Record more information about the hanging contexts

Include extra information such as the user_handle and hw_id so that
userspace can identify which of their contexts hung, useful if they are
performing self-diagnostics.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170129092433.10483-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
parent 0102ba1f
...@@ -969,6 +969,16 @@ struct drm_i915_error_state { ...@@ -969,6 +969,16 @@ struct drm_i915_error_state {
u32 semaphore_mboxes[I915_NUM_ENGINES - 1]; u32 semaphore_mboxes[I915_NUM_ENGINES - 1];
struct intel_instdone instdone; struct intel_instdone instdone;
/*
 * Per-engine snapshot of the context that owned the active request,
 * filled in by record_context() and printed by error_print_context().
 */
struct drm_i915_error_context {
/*
 * comm/pid: name and pid of the task resolved from the context's
 * struct pid; only written when that task is found, so callers test
 * pid for non-zero before printing them.
 */
char comm[TASK_COMM_LEN];
pid_t pid;
u32 handle; /* copied from ctx->user_handle */
u32 hw_id; /* copied from ctx->hw_id */
int ban_score; /* copied from ctx->ban_score */
int active; /* copied from ctx->active_count */
int guilty; /* copied from ctx->guilty_count */
} context;
struct drm_i915_error_object { struct drm_i915_error_object {
u64 gtt_offset; u64 gtt_offset;
u64 gtt_size; u64 gtt_size;
...@@ -1002,10 +1012,6 @@ struct drm_i915_error_state { ...@@ -1002,10 +1012,6 @@ struct drm_i915_error_state {
u32 pp_dir_base; u32 pp_dir_base;
}; };
} vm_info; } vm_info;
pid_t pid;
char comm[TASK_COMM_LEN];
int context_bans;
} engine[I915_NUM_ENGINES]; } engine[I915_NUM_ENGINES];
struct drm_i915_error_buffer { struct drm_i915_error_buffer {
......
...@@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m, ...@@ -384,6 +384,15 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
erq->head, erq->tail); erq->head, erq->tail);
} }
static void error_print_context(struct drm_i915_error_state_buf *m,
const char *header,
struct drm_i915_error_context *ctx)
{
err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
ctx->ban_score, ctx->guilty, ctx->active);
}
static void error_print_engine(struct drm_i915_error_state_buf *m, static void error_print_engine(struct drm_i915_error_state_buf *m,
struct drm_i915_error_engine *ee) struct drm_i915_error_engine *ee)
{ {
...@@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, ...@@ -457,6 +466,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
error_print_request(m, " ELSP[0]: ", &ee->execlist[0]); error_print_request(m, " ELSP[0]: ", &ee->execlist[0]);
error_print_request(m, " ELSP[1]: ", &ee->execlist[1]); error_print_request(m, " ELSP[1]: ", &ee->execlist[1]);
error_print_context(m, " Active context: ", &ee->context);
} }
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
...@@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -562,12 +572,12 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
for (i = 0; i < ARRAY_SIZE(error->engine); i++) { for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
if (error->engine[i].hangcheck_stalled && if (error->engine[i].hangcheck_stalled &&
error->engine[i].pid != -1) { error->engine[i].context.pid) {
err_printf(m, "Active process (on ring %s): %s [%d], context bans %d\n", err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
engine_str(i), engine_str(i),
error->engine[i].comm, error->engine[i].context.comm,
error->engine[i].pid, error->engine[i].context.pid,
error->engine[i].context_bans); error->engine[i].context.ban_score);
} }
} }
err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Reset count: %u\n", error->reset_count);
...@@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, ...@@ -658,11 +668,13 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
obj = ee->batchbuffer; obj = ee->batchbuffer;
if (obj) { if (obj) {
err_puts(m, dev_priv->engine[i]->name); err_puts(m, dev_priv->engine[i]->name);
if (ee->pid != -1) if (ee->context.pid)
err_printf(m, " (submitted by %s [%d], bans %d)", err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
ee->comm, ee->context.comm,
ee->pid, ee->context.pid,
ee->context_bans); ee->context.handle,
ee->context.hw_id,
ee->context.ban_score);
err_printf(m, " --- gtt_offset = 0x%08x %08x\n", err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
upper_32_bits(obj->gtt_offset), upper_32_bits(obj->gtt_offset),
lower_32_bits(obj->gtt_offset)); lower_32_bits(obj->gtt_offset));
...@@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine, ...@@ -1267,6 +1279,28 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
&ee->execlist[n]); &ee->execlist[n]);
} }
/*
 * record_context - snapshot identifying details of a GEM context
 * @e:   destination snapshot inside the error state
 * @ctx: context to copy from
 *
 * Copies the user handle, hw id, ban score and guilty/active counters
 * unconditionally. The owning task's name and pid are filled in only when
 * the context has a struct pid attached and it still resolves to a task;
 * otherwise e->comm and e->pid are left exactly as the caller provided them.
 */
static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	struct task_struct *owner;

	e->handle = ctx->user_handle;
	e->hw_id = ctx->hw_id;
	e->ban_score = ctx->ban_score;
	e->guilty = ctx->guilty_count;
	e->active = ctx->active_count;

	/* No pid attached to the context: nothing to attribute it to. */
	if (!ctx->pid)
		return;

	/* pid_task() is looked up, and its result used, under the RCU read lock. */
	rcu_read_lock();
	owner = pid_task(ctx->pid, PIDTYPE_PID);
	if (owner) {
		strcpy(e->comm, owner->comm);
		e->pid = owner->pid;
	}
	rcu_read_unlock();
}
static void i915_gem_record_rings(struct drm_i915_private *dev_priv, static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
struct drm_i915_error_state *error) struct drm_i915_error_state *error)
{ {
...@@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ...@@ -1281,7 +1315,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
struct drm_i915_error_engine *ee = &error->engine[i]; struct drm_i915_error_engine *ee = &error->engine[i];
struct drm_i915_gem_request *request; struct drm_i915_gem_request *request;
ee->pid = -1;
ee->engine_id = -1; ee->engine_id = -1;
if (!engine) if (!engine)
...@@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ...@@ -1296,11 +1329,12 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
request = i915_gem_find_active_request(engine); request = i915_gem_find_active_request(engine);
if (request) { if (request) {
struct intel_ring *ring; struct intel_ring *ring;
struct pid *pid;
ee->vm = request->ctx->ppgtt ? ee->vm = request->ctx->ppgtt ?
&request->ctx->ppgtt->base : &ggtt->base; &request->ctx->ppgtt->base : &ggtt->base;
record_context(&ee->context, request->ctx);
/* We need to copy these to an anonymous buffer /* We need to copy these to an anonymous buffer
* as the simplest method to avoid being overwritten * as the simplest method to avoid being overwritten
* by userspace. * by userspace.
...@@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv, ...@@ -1318,19 +1352,6 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
i915_error_object_create(dev_priv, i915_error_object_create(dev_priv,
request->ctx->engine[i].state); request->ctx->engine[i].state);
pid = request->ctx->pid;
if (pid) {
struct task_struct *task;
rcu_read_lock();
task = pid_task(pid, PIDTYPE_PID);
if (task) {
strcpy(ee->comm, task->comm);
ee->pid = task->pid;
}
rcu_read_unlock();
}
error->simulated |= error->simulated |=
i915_gem_context_no_error_capture(request->ctx); i915_gem_context_no_error_capture(request->ctx);
...@@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv, ...@@ -1534,12 +1555,12 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
"GPU HANG: ecode %d:%d:0x%08x", "GPU HANG: ecode %d:%d:0x%08x",
INTEL_GEN(dev_priv), engine_id, ecode); INTEL_GEN(dev_priv), engine_id, ecode);
if (engine_id != -1 && error->engine[engine_id].pid != -1) if (engine_id != -1 && error->engine[engine_id].context.pid)
len += scnprintf(error->error_msg + len, len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len, sizeof(error->error_msg) - len,
", in %s [%d]", ", in %s [%d]",
error->engine[engine_id].comm, error->engine[engine_id].context.comm,
error->engine[engine_id].pid); error->engine[engine_id].context.pid);
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
", reason: %s, action: %s", ", reason: %s, action: %s",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment