Commit ce800754 authored by Chris Wilson's avatar Chris Wilson

drm/i915: Add control flags to i915_handle_error()

Not all callers want the GPU error to handled in the same way, so expose
a control parameter. In the first instance, some callers do not want the
heavyweight error capture so add a bit to request the state to be
captured and saved.

v2: Pass msg down to i915_reset/i915_reset_engine so that we include the
reason for the reset in the dev_notice(), superseding the earlier option
to not print that notice.
v3: Stash the reason inside the i915->gpu_error to handover to the direct
reset from the blocking waiter.
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Reviewed-by: default avatarMichel Thierry <michel.thierry@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20180320100449.1360-2-chris@chris-wilson.co.uk
parent ca98317b
...@@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val) ...@@ -4011,8 +4011,8 @@ i915_wedged_set(void *data, u64 val)
engine->hangcheck.stalled = true; engine->hangcheck.stalled = true;
} }
i915_handle_error(i915, val, "Manually set wedged engine mask = %llx", i915_handle_error(i915, val, I915_ERROR_CAPTURE,
val); "Manually set wedged engine mask = %llx", val);
wait_on_bit(&i915->gpu_error.flags, wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF, I915_RESET_HANDOFF,
......
...@@ -1873,7 +1873,6 @@ static int i915_resume_switcheroo(struct drm_device *dev) ...@@ -1873,7 +1873,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
/** /**
* i915_reset - reset chip after a hang * i915_reset - reset chip after a hang
* @i915: #drm_i915_private to reset * @i915: #drm_i915_private to reset
* @flags: Instructions
* *
* Reset the chip. Useful if a hang is detected. Marks the device as wedged * Reset the chip. Useful if a hang is detected. Marks the device as wedged
* on failure. * on failure.
...@@ -1888,7 +1887,7 @@ static int i915_resume_switcheroo(struct drm_device *dev) ...@@ -1888,7 +1887,7 @@ static int i915_resume_switcheroo(struct drm_device *dev)
* - re-init interrupt state * - re-init interrupt state
* - re-init display * - re-init display
*/ */
void i915_reset(struct drm_i915_private *i915, unsigned int flags) void i915_reset(struct drm_i915_private *i915)
{ {
struct i915_gpu_error *error = &i915->gpu_error; struct i915_gpu_error *error = &i915->gpu_error;
int ret; int ret;
...@@ -1905,8 +1904,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags) ...@@ -1905,8 +1904,9 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
if (!i915_gem_unset_wedged(i915)) if (!i915_gem_unset_wedged(i915))
goto wakeup; goto wakeup;
if (!(flags & I915_RESET_QUIET)) if (error->reason)
dev_notice(i915->drm.dev, "Resetting chip after gpu hang\n"); dev_notice(i915->drm.dev,
"Resetting chip for %s\n", error->reason);
error->reset_count++; error->reset_count++;
disable_irq(i915->drm.irq); disable_irq(i915->drm.irq);
...@@ -2007,7 +2007,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, ...@@ -2007,7 +2007,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
/** /**
* i915_reset_engine - reset GPU engine to recover from a hang * i915_reset_engine - reset GPU engine to recover from a hang
* @engine: engine to reset * @engine: engine to reset
* @flags: options * @msg: reason for GPU reset; or NULL for no dev_notice()
* *
* Reset a specific GPU engine. Useful if a hang is detected. * Reset a specific GPU engine. Useful if a hang is detected.
* Returns zero on successful reset or otherwise an error code. * Returns zero on successful reset or otherwise an error code.
...@@ -2017,7 +2017,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv, ...@@ -2017,7 +2017,7 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
* - reset engine (which will force the engine to idle) * - reset engine (which will force the engine to idle)
* - re-init/configure engine * - re-init/configure engine
*/ */
int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
{ {
struct i915_gpu_error *error = &engine->i915->gpu_error; struct i915_gpu_error *error = &engine->i915->gpu_error;
struct i915_request *active_request; struct i915_request *active_request;
...@@ -2032,10 +2032,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags) ...@@ -2032,10 +2032,9 @@ int i915_reset_engine(struct intel_engine_cs *engine, unsigned int flags)
goto out; goto out;
} }
if (!(flags & I915_RESET_QUIET)) { if (msg)
dev_notice(engine->i915->drm.dev, dev_notice(engine->i915->drm.dev,
"Resetting %s after gpu hang\n", engine->name); "Resetting %s for %s\n", engine->name, msg);
}
error->reset_engine_count[engine->id]++; error->reset_engine_count[engine->id]++;
if (!engine->i915->guc.execbuf_client) if (!engine->i915->guc.execbuf_client)
......
...@@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev); ...@@ -2700,10 +2700,8 @@ extern void i915_driver_unload(struct drm_device *dev);
extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask); extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv); extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
#define I915_RESET_QUIET BIT(0) extern void i915_reset(struct drm_i915_private *i915);
extern void i915_reset(struct drm_i915_private *i915, unsigned int flags); extern int i915_reset_engine(struct intel_engine_cs *engine, const char *msg);
extern int i915_reset_engine(struct intel_engine_cs *engine,
unsigned int flags);
extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv); extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
extern int intel_reset_guc(struct drm_i915_private *dev_priv); extern int intel_reset_guc(struct drm_i915_private *dev_priv);
...@@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv) ...@@ -2751,10 +2749,12 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
&dev_priv->gpu_error.hangcheck_work, delay); &dev_priv->gpu_error.hangcheck_work, delay);
} }
__printf(3, 4) __printf(4, 5)
void i915_handle_error(struct drm_i915_private *dev_priv, void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask, u32 engine_mask,
unsigned long flags,
const char *fmt, ...); const char *fmt, ...);
#define I915_ERROR_CAPTURE BIT(0)
extern void intel_irq_init(struct drm_i915_private *dev_priv); extern void intel_irq_init(struct drm_i915_private *dev_priv);
extern void intel_irq_fini(struct drm_i915_private *dev_priv); extern void intel_irq_fini(struct drm_i915_private *dev_priv);
......
...@@ -269,6 +269,9 @@ struct i915_gpu_error { ...@@ -269,6 +269,9 @@ struct i915_gpu_error {
/** Number of times an engine has been reset */ /** Number of times an engine has been reset */
u32 reset_engine_count[I915_NUM_ENGINES]; u32 reset_engine_count[I915_NUM_ENGINES];
/** Reason for the current *global* reset */
const char *reason;
/** /**
* Waitqueue to signal when a hang is detected. Used to for waiters * Waitqueue to signal when a hang is detected. Used to for waiters
* to release the struct_mutex for the reset to procede. * to release the struct_mutex for the reset to procede.
......
...@@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg) ...@@ -2877,15 +2877,10 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
return IRQ_HANDLED; return IRQ_HANDLED;
} }
/** static void i915_reset_device(struct drm_i915_private *dev_priv,
* i915_reset_device - do process context error handling work const char *msg)
* @dev_priv: i915 device private
*
* Fire an error uevent so userspace can see that a hang or error
* was detected.
*/
static void i915_reset_device(struct drm_i915_private *dev_priv)
{ {
struct i915_gpu_error *error = &dev_priv->gpu_error;
struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj; struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
...@@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv) ...@@ -2901,29 +2896,32 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
i915_wedge_on_timeout(&w, dev_priv, 5*HZ) { i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
intel_prepare_reset(dev_priv); intel_prepare_reset(dev_priv);
error->reason = msg;
/* Signal that locked waiters should reset the GPU */ /* Signal that locked waiters should reset the GPU */
set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags); set_bit(I915_RESET_HANDOFF, &error->flags);
wake_up_all(&dev_priv->gpu_error.wait_queue); wake_up_all(&error->wait_queue);
/* Wait for anyone holding the lock to wakeup, without /* Wait for anyone holding the lock to wakeup, without
* blocking indefinitely on struct_mutex. * blocking indefinitely on struct_mutex.
*/ */
do { do {
if (mutex_trylock(&dev_priv->drm.struct_mutex)) { if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
i915_reset(dev_priv, 0); i915_reset(dev_priv);
mutex_unlock(&dev_priv->drm.struct_mutex); mutex_unlock(&dev_priv->drm.struct_mutex);
} }
} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags, } while (wait_on_bit_timeout(&error->flags,
I915_RESET_HANDOFF, I915_RESET_HANDOFF,
TASK_UNINTERRUPTIBLE, TASK_UNINTERRUPTIBLE,
1)); 1));
error->reason = NULL;
intel_finish_reset(dev_priv); intel_finish_reset(dev_priv);
} }
if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags)) if (!test_bit(I915_WEDGED, &error->flags))
kobject_uevent_env(kobj, kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
KOBJ_CHANGE, reset_done_event);
} }
static void i915_clear_error_registers(struct drm_i915_private *dev_priv) static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
...@@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) ...@@ -2955,6 +2953,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
* i915_handle_error - handle a gpu error * i915_handle_error - handle a gpu error
* @dev_priv: i915 device private * @dev_priv: i915 device private
* @engine_mask: mask representing engines that are hung * @engine_mask: mask representing engines that are hung
* @flags: control flags
* @fmt: Error message format string * @fmt: Error message format string
* *
* Do some basic checking of register state at error time and * Do some basic checking of register state at error time and
...@@ -2965,17 +2964,24 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv) ...@@ -2965,17 +2964,24 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
*/ */
void i915_handle_error(struct drm_i915_private *dev_priv, void i915_handle_error(struct drm_i915_private *dev_priv,
u32 engine_mask, u32 engine_mask,
unsigned long flags,
const char *fmt, ...) const char *fmt, ...)
{ {
struct intel_engine_cs *engine; struct intel_engine_cs *engine;
unsigned int tmp; unsigned int tmp;
va_list args;
char error_msg[80]; char error_msg[80];
char *msg = NULL;
if (fmt) {
va_list args;
va_start(args, fmt); va_start(args, fmt);
vscnprintf(error_msg, sizeof(error_msg), fmt, args); vscnprintf(error_msg, sizeof(error_msg), fmt, args);
va_end(args); va_end(args);
msg = error_msg;
}
/* /*
* In most cases it's guaranteed that we get here with an RPM * In most cases it's guaranteed that we get here with an RPM
* reference held, for example because there is a pending GPU * reference held, for example because there is a pending GPU
...@@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -2986,8 +2992,11 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
intel_runtime_pm_get(dev_priv); intel_runtime_pm_get(dev_priv);
engine_mask &= INTEL_INFO(dev_priv)->ring_mask; engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
i915_capture_error_state(dev_priv, engine_mask, error_msg);
if (flags & I915_ERROR_CAPTURE) {
i915_capture_error_state(dev_priv, engine_mask, msg);
i915_clear_error_registers(dev_priv); i915_clear_error_registers(dev_priv);
}
/* /*
* Try engine reset when available. We fall back to full reset if * Try engine reset when available. We fall back to full reset if
...@@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -3000,7 +3009,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
&dev_priv->gpu_error.flags)) &dev_priv->gpu_error.flags))
continue; continue;
if (i915_reset_engine(engine, 0) == 0) if (i915_reset_engine(engine, msg) == 0)
engine_mask &= ~intel_engine_flag(engine); engine_mask &= ~intel_engine_flag(engine);
clear_bit(I915_RESET_ENGINE + engine->id, clear_bit(I915_RESET_ENGINE + engine->id,
...@@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv, ...@@ -3030,7 +3039,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
} }
i915_reset_device(dev_priv); i915_reset_device(dev_priv, msg);
for_each_engine(engine, dev_priv, tmp) { for_each_engine(engine, dev_priv, tmp) {
clear_bit(I915_RESET_ENGINE + engine->id, clear_bit(I915_RESET_ENGINE + engine->id,
......
...@@ -1229,7 +1229,7 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request) ...@@ -1229,7 +1229,7 @@ static bool __i915_wait_request_check_and_reset(struct i915_request *request)
return false; return false;
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
i915_reset(request->i915, 0); i915_reset(request->i915);
return true; return true;
} }
......
...@@ -246,9 +246,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) ...@@ -246,9 +246,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
*/ */
tmp = I915_READ_CTL(engine); tmp = I915_READ_CTL(engine);
if (tmp & RING_WAIT) { if (tmp & RING_WAIT) {
i915_handle_error(dev_priv, BIT(engine->id), i915_handle_error(dev_priv, BIT(engine->id), 0,
"Kicking stuck wait on %s", "stuck wait on %s", engine->name);
engine->name);
I915_WRITE_CTL(engine, tmp); I915_WRITE_CTL(engine, tmp);
return ENGINE_WAIT_KICK; return ENGINE_WAIT_KICK;
} }
...@@ -258,8 +257,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd) ...@@ -258,8 +257,8 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
default: default:
return ENGINE_DEAD; return ENGINE_DEAD;
case 1: case 1:
i915_handle_error(dev_priv, ALL_ENGINES, i915_handle_error(dev_priv, ALL_ENGINES, 0,
"Kicking stuck semaphore on %s", "stuck semaphore on %s",
engine->name); engine->name);
I915_WRITE_CTL(engine, tmp); I915_WRITE_CTL(engine, tmp);
return ENGINE_WAIT_KICK; return ENGINE_WAIT_KICK;
...@@ -386,13 +385,13 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915, ...@@ -386,13 +385,13 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
if (stuck != hung) if (stuck != hung)
hung &= ~stuck; hung &= ~stuck;
len = scnprintf(msg, sizeof(msg), len = scnprintf(msg, sizeof(msg),
"%s on ", stuck == hung ? "No progress" : "Hang"); "%s on ", stuck == hung ? "no progress" : "hang");
for_each_engine_masked(engine, i915, hung, tmp) for_each_engine_masked(engine, i915, hung, tmp)
len += scnprintf(msg + len, sizeof(msg) - len, len += scnprintf(msg + len, sizeof(msg) - len,
"%s, ", engine->name); "%s, ", engine->name);
msg[len-2] = '\0'; msg[len-2] = '\0';
return i915_handle_error(i915, hung, "%s", msg); return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg);
} }
/* /*
......
...@@ -433,7 +433,7 @@ static int igt_global_reset(void *arg) ...@@ -433,7 +433,7 @@ static int igt_global_reset(void *arg)
mutex_lock(&i915->drm.struct_mutex); mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error); reset_count = i915_reset_count(&i915->gpu_error);
i915_reset(i915, I915_RESET_QUIET); i915_reset(i915);
if (i915_reset_count(&i915->gpu_error) == reset_count) { if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n"); pr_err("No GPU reset recorded!\n");
...@@ -518,7 +518,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) ...@@ -518,7 +518,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
engine->hangcheck.seqno = engine->hangcheck.seqno =
intel_engine_get_seqno(engine); intel_engine_get_seqno(engine);
err = i915_reset_engine(engine, I915_RESET_QUIET); err = i915_reset_engine(engine, NULL);
if (err) { if (err) {
pr_err("i915_reset_engine failed\n"); pr_err("i915_reset_engine failed\n");
break; break;
...@@ -725,7 +725,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, ...@@ -725,7 +725,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
engine->hangcheck.seqno = engine->hangcheck.seqno =
intel_engine_get_seqno(engine); intel_engine_get_seqno(engine);
err = i915_reset_engine(engine, I915_RESET_QUIET); err = i915_reset_engine(engine, NULL);
if (err) { if (err) {
pr_err("i915_reset_engine(%s:%s) failed, err=%d\n", pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
engine->name, active ? "active" : "idle", err); engine->name, active ? "active" : "idle", err);
...@@ -865,7 +865,6 @@ static int igt_wait_reset(void *arg) ...@@ -865,7 +865,6 @@ static int igt_wait_reset(void *arg)
__func__, rq->fence.seqno, hws_seqno(&h, rq)); __func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
i915_reset(i915, 0);
i915_gem_set_wedged(i915); i915_gem_set_wedged(i915);
err = -EIO; err = -EIO;
...@@ -962,7 +961,6 @@ static int igt_reset_queue(void *arg) ...@@ -962,7 +961,6 @@ static int igt_reset_queue(void *arg)
i915_request_put(rq); i915_request_put(rq);
i915_request_put(prev); i915_request_put(prev);
i915_reset(i915, 0);
i915_gem_set_wedged(i915); i915_gem_set_wedged(i915);
err = -EIO; err = -EIO;
...@@ -971,7 +969,7 @@ static int igt_reset_queue(void *arg) ...@@ -971,7 +969,7 @@ static int igt_reset_queue(void *arg)
reset_count = fake_hangcheck(prev); reset_count = fake_hangcheck(prev);
i915_reset(i915, I915_RESET_QUIET); i915_reset(i915);
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
&i915->gpu_error.flags)); &i915->gpu_error.flags));
...@@ -1069,7 +1067,6 @@ static int igt_handle_error(void *arg) ...@@ -1069,7 +1067,6 @@ static int igt_handle_error(void *arg)
__func__, rq->fence.seqno, hws_seqno(&h, rq)); __func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
i915_reset(i915, 0);
i915_gem_set_wedged(i915); i915_gem_set_wedged(i915);
err = -EIO; err = -EIO;
...@@ -1084,7 +1081,7 @@ static int igt_handle_error(void *arg) ...@@ -1084,7 +1081,7 @@ static int igt_handle_error(void *arg)
engine->hangcheck.stalled = true; engine->hangcheck.stalled = true;
engine->hangcheck.seqno = intel_engine_get_seqno(engine); engine->hangcheck.seqno = intel_engine_get_seqno(engine);
i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); i915_handle_error(i915, intel_engine_flag(engine), 0, NULL);
xchg(&i915->gpu_error.first_error, error); xchg(&i915->gpu_error.first_error, error);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment