Commit eb8d0f5a authored by Chris Wilson's avatar Chris Wilson

drm/i915: Remove GPU reset dependence on struct_mutex

Now that the submission backends are controlled via their own spinlocks,
with a wave of a magic wand we can lift the struct_mutex requirement
around GPU reset. That is we allow the submission frontend (userspace)
to keep on submitting while we process the GPU reset as we can suspend
the backend independently.

The major change is around the backoff/handoff strategy for performing
the reset. With no mutex deadlock, we no longer have to coordinate with
any waiter, and just perform the reset immediately.

Testcase: igt/gem_mmap_gtt/hang # regresses
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: default avatarMika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190125132230.22221-3-chris@chris-wilson.co.uk
parent fe62365f
......@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_puts(m, "Wedged\n");
if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
seq_puts(m, "Reset in progress: struct_mutex backoff\n");
if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
seq_puts(m, "Reset in progress: reset handoff to waiter\n");
if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
seq_puts(m, "Waiter holding struct mutex\n");
if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
......@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
struct rb_node *rb;
seq_printf(m, "%s:\n", engine->name);
seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
engine->hangcheck.seqno, seqno[id],
intel_engine_last_submit(engine));
seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
intel_engine_last_submit(engine),
jiffies_to_msecs(jiffies -
engine->hangcheck.action_timestamp));
seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
yesno(intel_engine_has_waiter(engine)),
yesno(test_bit(engine->id,
&dev_priv->gpu_error.missed_irq_rings)),
yesno(engine->hangcheck.stalled),
yesno(engine->hangcheck.wedged));
&dev_priv->gpu_error.missed_irq_rings)));
spin_lock_irq(&b->rb_lock);
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
......@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)engine->hangcheck.acthd,
(long long)acthd[id]);
seq_printf(m, "\taction = %s(%d) %d ms ago\n",
hangcheck_action_to_str(engine->hangcheck.action),
engine->hangcheck.action,
jiffies_to_msecs(jiffies -
engine->hangcheck.action_timestamp));
if (engine->id == RCS) {
seq_puts(m, "\tinstdone read =\n");
......@@ -3911,8 +3904,6 @@ static int
i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
struct intel_engine_cs *engine;
unsigned int tmp;
/*
* There is no safeguard against this debugfs entry colliding
......@@ -3925,18 +3916,8 @@ i915_wedged_set(void *data, u64 val)
if (i915_reset_backoff(&i915->gpu_error))
return -EAGAIN;
for_each_engine_masked(engine, i915, val, tmp) {
engine->hangcheck.seqno = intel_engine_get_seqno(engine);
engine->hangcheck.stalled = true;
}
i915_handle_error(i915, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF,
TASK_UNINTERRUPTIBLE);
return 0;
}
......@@ -4091,13 +4072,8 @@ i915_drop_caches_set(void *data, u64 val)
mutex_unlock(&i915->drm.struct_mutex);
}
if (val & DROP_RESET_ACTIVE &&
i915_terminally_wedged(&i915->gpu_error)) {
if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
wait_on_bit(&i915->gpu_error.flags,
I915_RESET_HANDOFF,
TASK_UNINTERRUPTIBLE);
}
fs_reclaim_acquire(GFP_KERNEL);
if (val & DROP_BOUND)
......
......@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
}
static inline bool i915_reset_handoff(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
}
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
......
......@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
struct intel_rps_client *rps_client)
{
might_sleep();
#if IS_ENABLED(CONFIG_LOCKDEP)
GEM_BUG_ON(debug_locks &&
!!lockdep_is_held(&obj->base.dev->struct_mutex) !=
!!(flags & I915_WAIT_LOCKED));
#endif
GEM_BUG_ON(timeout < 0);
timeout = i915_gem_object_wait_reservation(obj->resv,
......@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
GEM_TRACE("\n");
mutex_lock(&i915->drm.struct_mutex);
wakeref = intel_runtime_pm_get(i915);
intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
......@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
intel_runtime_pm_put(i915, wakeref);
mutex_lock(&i915->drm.struct_mutex);
i915_gem_contexts_lost(i915);
mutex_unlock(&i915->drm.struct_mutex);
}
......@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
wakeref = intel_runtime_pm_get(i915);
intel_suspend_gt_powersave(i915);
flush_workqueue(i915->wq);
mutex_lock(&i915->drm.struct_mutex);
/*
......@@ -4563,11 +4559,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
i915_retire_requests(i915); /* ensure we flush after wedging */
mutex_unlock(&i915->drm.struct_mutex);
i915_reset_flush(i915);
intel_uc_suspend(i915);
cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
cancel_delayed_work_sync(&i915->gt.retire_work);
drain_delayed_work(&i915->gt.retire_work);
/*
* As the idle_work is rearming if it detects a race, play safe and
......@@ -4575,6 +4569,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
*/
drain_delayed_work(&i915->gt.idle_work);
intel_uc_suspend(i915);
/*
* Assert that we successfully flushed all the work and
* reset the GPU back to its idle, low power state.
......
......@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
};
#endif
......@@ -39,6 +39,7 @@
#include <linux/pagevec.h>
#include "i915_request.h"
#include "i915_reset.h"
#include "i915_selftest.h"
#include "i915_timeline.h"
......
......@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
err_printf(m, " hangcheck action: %s\n",
hangcheck_action_to_str(ee->hangcheck_action));
err_printf(m, " hangcheck action timestamp: %dms (%lu%s)\n",
err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
ee->hangcheck_timestamp,
ee->hangcheck_timestamp == epoch ? "; epoch" : "");
......@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
jiffies_to_msecs(error->capture - error->epoch));
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
if (error->engine[i].hangcheck_stalled &&
error->engine[i].context.pid) {
err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
engine_name(m->i915, i),
error->engine[i].context.comm,
error->engine[i].context.pid,
error->engine[i].context.ban_score,
bannable(&error->engine[i].context));
}
if (!error->engine[i].context.pid)
continue;
err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
engine_name(m->i915, i),
error->engine[i].context.comm,
error->engine[i].context.pid,
error->engine[i].context.ban_score,
bannable(&error->engine[i].context));
}
err_printf(m, "Reset count: %u\n", error->reset_count);
err_printf(m, "Suspend count: %u\n", error->suspend_count);
......@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
return i;
}
/* Generate a semi-unique error code. The code is not meant to have meaning, The
/*
* Generate a semi-unique error code. The code is not meant to have meaning, The
* code's only purpose is to try to prevent false duplicated bug reports by
* grossly estimating a GPU error state.
*
......@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
*
* It's only a small step better than a random number in its current form.
*/
static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
struct i915_gpu_state *error,
int *engine_id)
static u32 i915_error_generate_code(struct i915_gpu_state *error,
unsigned long engine_mask)
{
u32 error_code = 0;
int i;
/* IPEHR would be an ideal way to detect errors, as it's the gross
/*
* IPEHR would be an ideal way to detect errors, as it's the gross
* measure of "the command that hung." However, has some very common
* synchronization commands which almost always appear in the case
* strictly a client bug. Use instdone to differentiate those some.
*/
for (i = 0; i < I915_NUM_ENGINES; i++) {
if (error->engine[i].hangcheck_stalled) {
if (engine_id)
*engine_id = i;
if (engine_mask) {
struct drm_i915_error_engine *ee =
&error->engine[ffs(engine_mask)];
return error->engine[i].ipehr ^
error->engine[i].instdone.instdone;
}
return ee->ipehr ^ ee->instdone.instdone;
}
return error_code;
return 0;
}
static void gem_record_fences(struct i915_gpu_state *error)
......@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
}
ee->idle = intel_engine_is_idle(engine);
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->hangcheck_action = engine->hangcheck.action;
ee->hangcheck_stalled = engine->hangcheck.stalled;
if (!ee->idle)
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
engine);
......@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
error->pgtbl_er = I915_READ(PGTBL_ER);
}
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
struct i915_gpu_state *error,
u32 engine_mask,
const char *error_msg)
static const char *
error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
{
u32 ecode;
int engine_id = -1, len;
int len;
int i;
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
for (i = 0; i < ARRAY_SIZE(error->engine); i++)
if (!error->engine[i].context.pid)
engines &= ~BIT(i);
len = scnprintf(error->error_msg, sizeof(error->error_msg),
"GPU HANG: ecode %d:%d:0x%08x",
INTEL_GEN(dev_priv), engine_id, ecode);
if (engine_id != -1 && error->engine[engine_id].context.pid)
"GPU HANG: ecode %d:%lx:0x%08x",
INTEL_GEN(error->i915), engines,
i915_error_generate_code(error, engines));
if (engines) {
/* Just show the first executing process, more is confusing */
i = ffs(engines);
len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len,
", in %s [%d]",
error->engine[engine_id].context.comm,
error->engine[engine_id].context.pid);
error->engine[i].context.comm,
error->engine[i].context.pid);
}
if (msg)
len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len,
", %s", msg);
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
", reason: %s, action: %s",
error_msg,
engine_mask ? "reset" : "continue");
return error->error_msg;
}
static void capture_gen_state(struct i915_gpu_state *error)
......@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
const struct drm_i915_error_engine *ee = &error->engine[i];
if (ee->hangcheck_stalled &&
if (ee->hangcheck_timestamp &&
time_before(ee->hangcheck_timestamp, epoch))
epoch = ee->hangcheck_timestamp;
}
......@@ -1921,7 +1916,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* i915_capture_error_state - capture an error record for later analysis
* @i915: i915 device
* @engine_mask: the mask of engines triggering the hang
* @error_msg: a message to insert into the error capture header
* @msg: a message to insert into the error capture header
*
* Should be called when an error is detected (either a hang or an error
* interrupt) to capture error state from the time of the error. Fills
......@@ -1929,8 +1924,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915,
u32 engine_mask,
const char *error_msg)
unsigned long engine_mask,
const char *msg)
{
static bool warned;
struct i915_gpu_state *error;
......@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
if (IS_ERR(error))
return;
i915_error_capture_msg(i915, error, engine_mask, error_msg);
DRM_INFO("%s\n", error->error_msg);
dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
if (!error->simulated) {
spin_lock_irqsave(&i915->gpu_error.lock, flags);
......
......@@ -85,8 +85,6 @@ struct i915_gpu_state {
bool waiting;
int num_waiters;
unsigned long hangcheck_timestamp;
bool hangcheck_stalled;
enum intel_engine_hangcheck_action hangcheck_action;
struct i915_address_space *vm;
int num_requests;
u32 reset_count;
......@@ -197,6 +195,8 @@ struct i915_gpu_state {
struct scatterlist *sgl, *fit;
};
struct i915_gpu_restart;
struct i915_gpu_error {
/* For hangcheck timer */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
......@@ -247,15 +247,6 @@ struct i915_gpu_error {
* i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
* secondary role in preventing two concurrent global reset attempts.
*
* #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
* struct_mutex. We try to acquire the struct_mutex in the reset worker,
* but it may be held by some long running waiter (that we cannot
* interrupt without causing trouble). Once we are ready to do the GPU
* reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
* they already hold the struct_mutex and want to participate they can
* inspect the bit and do the reset directly, otherwise the worker
* waits for the struct_mutex.
*
* #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
* acquire the struct_mutex to reset an engine, we need an explicit
* flag to prevent two concurrent reset attempts in the same engine.
......@@ -269,20 +260,13 @@ struct i915_gpu_error {
*/
unsigned long flags;
#define I915_RESET_BACKOFF 0
#define I915_RESET_HANDOFF 1
#define I915_RESET_MODESET 2
#define I915_RESET_ENGINE 3
#define I915_RESET_MODESET 1
#define I915_RESET_ENGINE 2
#define I915_WEDGED (BITS_PER_LONG - 1)
/** Number of times an engine has been reset */
u32 reset_engine_count[I915_NUM_ENGINES];
/** Set of stalled engines with guilty requests, in the current reset */
u32 stalled_mask;
/** Reason for the current *global* reset */
const char *reason;
struct mutex wedge_mutex; /* serialises wedging/unwedging */
/**
......@@ -299,6 +283,8 @@ struct i915_gpu_error {
/* For missed irq/seqno simulation. */
unsigned long test_irq_rings;
struct i915_gpu_restart *restart;
};
struct drm_i915_error_state_buf {
......@@ -320,7 +306,7 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *dev_priv,
u32 engine_mask,
unsigned long engine_mask,
const char *error_msg);
static inline struct i915_gpu_state *
......
......@@ -1083,18 +1083,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
return false;
}
static bool __i915_wait_request_check_and_reset(struct i915_request *request)
{
struct i915_gpu_error *error = &request->i915->gpu_error;
if (likely(!i915_reset_handoff(error)))
return false;
__set_current_state(TASK_RUNNING);
i915_reset(request->i915, error->stalled_mask, error->reason);
return true;
}
/**
* i915_request_wait - wait until execution of request has finished
* @rq: the request to wait upon
......@@ -1120,17 +1108,10 @@ long i915_request_wait(struct i915_request *rq,
{
const int state = flags & I915_WAIT_INTERRUPTIBLE ?
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
DEFINE_WAIT_FUNC(reset, default_wake_function);
DEFINE_WAIT_FUNC(exec, default_wake_function);
struct intel_wait wait;
might_sleep();
#if IS_ENABLED(CONFIG_LOCKDEP)
GEM_BUG_ON(debug_locks &&
!!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
!!(flags & I915_WAIT_LOCKED));
#endif
GEM_BUG_ON(timeout < 0);
if (i915_request_completed(rq))
......@@ -1140,11 +1121,7 @@ long i915_request_wait(struct i915_request *rq,
return -ETIME;
trace_i915_request_wait_begin(rq, flags);
add_wait_queue(&rq->execute, &exec);
if (flags & I915_WAIT_LOCKED)
add_wait_queue(errq, &reset);
intel_wait_init(&wait);
if (flags & I915_WAIT_PRIORITY)
i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
......@@ -1155,10 +1132,6 @@ long i915_request_wait(struct i915_request *rq,
if (intel_wait_update_request(&wait, rq))
break;
if (flags & I915_WAIT_LOCKED &&
__i915_wait_request_check_and_reset(rq))
continue;
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
goto complete;
......@@ -1188,9 +1161,6 @@ long i915_request_wait(struct i915_request *rq,
*/
goto wakeup;
if (flags & I915_WAIT_LOCKED)
__i915_wait_request_check_and_reset(rq);
for (;;) {
if (signal_pending_state(state, current)) {
timeout = -ERESTARTSYS;
......@@ -1214,21 +1184,6 @@ long i915_request_wait(struct i915_request *rq,
if (i915_request_completed(rq))
break;
/*
* If the GPU is hung, and we hold the lock, reset the GPU
* and then check for completion. On a full reset, the engine's
* HW seqno will be advanced passed us and we are complete.
* If we do a partial reset, we have to wait for the GPU to
* resume and update the breadcrumb.
*
* If we don't hold the mutex, we can just wait for the worker
* to come along and update the breadcrumb (either directly
* itself, or indirectly by recovering the GPU).
*/
if (flags & I915_WAIT_LOCKED &&
__i915_wait_request_check_and_reset(rq))
continue;
/* Only spin if we know the GPU is processing this request */
if (__i915_spin_request(rq, wait.seqno, state, 2))
break;
......@@ -1242,8 +1197,6 @@ long i915_request_wait(struct i915_request *rq,
intel_engine_remove_wait(rq->engine, &wait);
complete:
__set_current_state(TASK_RUNNING);
if (flags & I915_WAIT_LOCKED)
remove_wait_queue(errq, &reset);
remove_wait_queue(&rq->execute, &exec);
trace_i915_request_wait_end(rq);
......
This diff is collapsed.
......@@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915,
int i915_reset_engine(struct intel_engine_cs *engine,
const char *reason);
void i915_reset_request(struct i915_request *rq, bool guilty);
bool i915_reset_flush(struct drm_i915_private *i915);
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
......
......@@ -1119,10 +1119,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force)
if (!reset_engines(i915) && !force)
return;
for_each_engine(engine, i915, id) {
if (engine->reset.reset)
engine->reset.reset(engine, NULL);
}
for_each_engine(engine, i915, id)
intel_engine_reset(engine, false);
}
/**
......
......@@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data)
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static struct i915_request *
guc_reset_prepare(struct intel_engine_cs *engine)
static void guc_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
......@@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
*/
if (engine->i915->guc.preempt_wq)
flush_workqueue(engine->i915->guc.preempt_wq);
return i915_gem_find_active_request(engine);
}
/*
......
......@@ -25,6 +25,17 @@
#include "i915_drv.h"
#include "i915_reset.h"
struct hangcheck {
u64 acthd;
u32 seqno;
enum intel_engine_hangcheck_action action;
unsigned long action_timestamp;
int deadlock;
struct intel_instdone instdone;
bool wedged:1;
bool stalled:1;
};
static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
{
u32 tmp = current_instdone | *old_instdone;
......@@ -119,25 +130,22 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
}
static void hangcheck_load_sample(struct intel_engine_cs *engine,
struct intel_engine_hangcheck *hc)
struct hangcheck *hc)
{
hc->acthd = intel_engine_get_active_head(engine);
hc->seqno = intel_engine_get_seqno(engine);
}
static void hangcheck_store_sample(struct intel_engine_cs *engine,
const struct intel_engine_hangcheck *hc)
const struct hangcheck *hc)
{
engine->hangcheck.acthd = hc->acthd;
engine->hangcheck.seqno = hc->seqno;
engine->hangcheck.action = hc->action;
engine->hangcheck.stalled = hc->stalled;
engine->hangcheck.wedged = hc->wedged;
}
static enum intel_engine_hangcheck_action
hangcheck_get_action(struct intel_engine_cs *engine,
const struct intel_engine_hangcheck *hc)
const struct hangcheck *hc)
{
if (engine->hangcheck.seqno != hc->seqno)
return ENGINE_ACTIVE_SEQNO;
......@@ -149,7 +157,7 @@ hangcheck_get_action(struct intel_engine_cs *engine,
}
static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
struct intel_engine_hangcheck *hc)
struct hangcheck *hc)
{
unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
......@@ -265,19 +273,19 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
for_each_engine(engine, dev_priv, id) {
struct intel_engine_hangcheck hc;
struct hangcheck hc;
hangcheck_load_sample(engine, &hc);
hangcheck_accumulate_sample(engine, &hc);
hangcheck_store_sample(engine, &hc);
if (engine->hangcheck.stalled) {
if (hc.stalled) {
hung |= intel_engine_flag(engine);
if (hc.action != ENGINE_DEAD)
stuck |= intel_engine_flag(engine);
}
if (engine->hangcheck.wedged)
if (hc.wedged)
wedged |= intel_engine_flag(engine);
}
......
......@@ -136,6 +136,7 @@
#include <drm/i915_drm.h>
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_reset.h"
#include "i915_vgpu.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
......@@ -264,7 +265,8 @@ static void unwind_wa_tail(struct i915_request *rq)
assert_ring_tail_valid(rq->ring, rq->tail);
}
static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
struct i915_request *rq, *rn, *active = NULL;
struct list_head *uninitialized_var(pl);
......@@ -306,6 +308,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
list_move_tail(&active->sched.link,
i915_sched_lookup_priolist(engine, prio));
}
return active;
}
void
......@@ -1732,11 +1736,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
return 0;
}
static struct i915_request *
execlists_reset_prepare(struct intel_engine_cs *engine)
static void execlists_reset_prepare(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
struct i915_request *request, *active;
unsigned long flags;
GEM_TRACE("%s: depth<-%d\n", engine->name,
......@@ -1752,59 +1754,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
* prevents the race.
*/
__tasklet_disable_sync_once(&execlists->tasklet);
GEM_BUG_ON(!reset_in_progress(execlists));
/* And flush any current direct submission. */
spin_lock_irqsave(&engine->timeline.lock, flags);
/*
* We want to flush the pending context switches, having disabled
* the tasklet above, we can assume exclusive access to the execlists.
* For this allows us to catch up with an inflight preemption event,
* and avoid blaming an innocent request if the stall was due to the
* preemption itself.
*/
process_csb(engine);
/*
* The last active request can then be no later than the last request
* now in ELSP[0]. So search backwards from there, so that if the GPU
* has advanced beyond the last CSB update, it will be pardoned.
*/
active = NULL;
request = port_request(execlists->port);
if (request) {
/*
* Prevent the breadcrumb from advancing before we decide
* which request is currently active.
*/
intel_engine_stop_cs(engine);
list_for_each_entry_from_reverse(request,
&engine->timeline.requests,
link) {
if (__i915_request_completed(request,
request->global_seqno))
break;
active = request;
}
}
process_csb(engine); /* drain preemption events */
spin_unlock_irqrestore(&engine->timeline.lock, flags);
return active;
}
static void execlists_reset(struct intel_engine_cs *engine,
struct i915_request *request)
static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
struct i915_request *rq;
unsigned long flags;
u32 *regs;
GEM_TRACE("%s request global=%d, current=%d\n",
engine->name, request ? request->global_seqno : 0,
intel_engine_get_seqno(engine));
spin_lock_irqsave(&engine->timeline.lock, flags);
/*
......@@ -1819,12 +1783,18 @@ static void execlists_reset(struct intel_engine_cs *engine,
execlists_cancel_port_requests(execlists);
/* Push back any incomplete requests for replay after the reset. */
__unwind_incomplete_requests(engine);
rq = __unwind_incomplete_requests(engine);
/* Following the reset, we need to reload the CSB read/write pointers */
reset_csb_pointers(&engine->execlists);
spin_unlock_irqrestore(&engine->timeline.lock, flags);
GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
engine->name,
rq ? rq->global_seqno : 0,
intel_engine_get_seqno(engine),
yesno(stalled));
if (!rq)
goto out_unlock;
/*
* If the request was innocent, we leave the request in the ELSP
......@@ -1837,8 +1807,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
* and have to at least restore the RING register in the context
* image back to the expected values to skip over the guilty request.
*/
if (!request || request->fence.error != -EIO)
return;
i915_reset_request(rq, stalled);
if (!stalled)
goto out_unlock;
/*
* We want a simple context + ring to execute the breadcrumb update.
......@@ -1848,7 +1819,7 @@ static void execlists_reset(struct intel_engine_cs *engine,
* future request will be after userspace has had the opportunity
* to recreate its own state.
*/
regs = request->hw_context->lrc_reg_state;
regs = rq->hw_context->lrc_reg_state;
if (engine->pinned_default_state) {
memcpy(regs, /* skip restoring the vanilla PPHWSP */
engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
......@@ -1856,17 +1827,14 @@ static void execlists_reset(struct intel_engine_cs *engine,
}
/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
request->ring->head = intel_ring_wrap(request->ring, request->postfix);
rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
intel_ring_update_space(rq->ring);
execlists_init_reg_state(regs, request->gem_context, engine,
request->ring);
execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
__execlists_update_reg_state(engine, rq->hw_context);
__execlists_update_reg_state(engine, request->hw_context);
intel_ring_update_space(request->ring);
/* Reset WaIdleLiteRestore:bdw,skl as well */
unwind_wa_tail(request);
out_unlock:
spin_unlock_irqrestore(&engine->timeline.lock, flags);
}
static void execlists_reset_finish(struct intel_engine_cs *engine)
......@@ -1879,6 +1847,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
* to sleep before we restart and reload a context.
*
*/
GEM_BUG_ON(!reset_in_progress(execlists));
if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
execlists->tasklet.func(execlists->tasklet.data);
......
......@@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
if (!overlay)
return;
intel_overlay_release_old_vid(overlay);
overlay->old_xscale = 0;
overlay->old_yscale = 0;
overlay->crtc = NULL;
......
......@@ -33,6 +33,7 @@
#include "i915_drv.h"
#include "i915_gem_render_state.h"
#include "i915_reset.h"
#include "i915_trace.h"
#include "intel_drv.h"
#include "intel_workarounds.h"
......@@ -711,52 +712,80 @@ static int init_ring_common(struct intel_engine_cs *engine)
return ret;
}
static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
static void reset_prepare(struct intel_engine_cs *engine)
{
intel_engine_stop_cs(engine);
return i915_gem_find_active_request(engine);
}
static void skip_request(struct i915_request *rq)
static void reset_ring(struct intel_engine_cs *engine, bool stalled)
{
void *vaddr = rq->ring->vaddr;
struct i915_timeline *tl = &engine->timeline;
struct i915_request *pos, *rq;
unsigned long flags;
u32 head;
head = rq->infix;
if (rq->postfix < head) {
memset32(vaddr + head, MI_NOOP,
(rq->ring->size - head) / sizeof(u32));
head = 0;
rq = NULL;
spin_lock_irqsave(&tl->lock, flags);
list_for_each_entry(pos, &tl->requests, link) {
if (!__i915_request_completed(pos, pos->global_seqno)) {
rq = pos;
break;
}
}
memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
}
static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
{
GEM_TRACE("%s request global=%d, current=%d\n",
engine->name, rq ? rq->global_seqno : 0,
intel_engine_get_seqno(engine));
GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
engine->name,
rq ? rq->global_seqno : 0,
intel_engine_get_seqno(engine),
yesno(stalled));
/*
* Try to restore the logical GPU state to match the continuation
* of the request queue. If we skip the context/PD restore, then
* the next request may try to execute assuming that its context
* is valid and loaded on the GPU and so may try to access invalid
* memory, prompting repeated GPU hangs.
* The guilty request will get skipped on a hung engine.
*
* If the request was guilty, we still restore the logical state
* in case the next request requires it (e.g. the aliasing ppgtt),
* but skip over the hung batch.
* Users of client default contexts do not rely on logical
* state preserved between batches so it is safe to execute
* queued requests following the hang. Non default contexts
* rely on preserved state, so skipping a batch loses the
* evolution of the state and it needs to be considered corrupted.
* Executing more queued batches on top of corrupted state is
* risky. But we take the risk by trying to advance through
* the queued requests in order to make the client behaviour
* more predictable around resets, by not throwing away random
* amount of batches it has prepared for execution. Sophisticated
* clients can use gem_reset_stats_ioctl and dma fence status
* (exported via sync_file info ioctl on explicit fences) to observe
* when it loses the context state and should rebuild accordingly.
*
* If the request was innocent, we try to replay the request with
* the restored context.
* The context ban, and ultimately the client ban, mechanism are safety
* valves if client submission ends up resulting in nothing more than
* subsequent hangs.
*/
if (rq) {
/* If the rq hung, jump to its breadcrumb and skip the batch */
rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
if (rq->fence.error == -EIO)
skip_request(rq);
/*
* Try to restore the logical GPU state to match the
* continuation of the request queue. If we skip the
* context/PD restore, then the next request may try to execute
* assuming that its context is valid and loaded on the GPU and
* so may try to access invalid memory, prompting repeated GPU
* hangs.
*
* If the request was guilty, we still restore the logical
* state in case the next request requires it (e.g. the
* aliasing ppgtt), but skip over the hung batch.
*
* If the request was innocent, we try to replay the request
* with the restored context.
*/
i915_reset_request(rq, stalled);
GEM_BUG_ON(rq->ring != engine->buffer);
head = rq->head;
} else {
head = engine->buffer->tail;
}
engine->buffer->head = intel_ring_wrap(engine->buffer, head);
spin_unlock_irqrestore(&tl->lock, flags);
}
static void reset_finish(struct intel_engine_cs *engine)
......
......@@ -120,13 +120,8 @@ struct intel_instdone {
struct intel_engine_hangcheck {
u64 acthd;
u32 seqno;
enum intel_engine_hangcheck_action action;
unsigned long action_timestamp;
int deadlock;
struct intel_instdone instdone;
struct i915_request *active_request;
bool stalled:1;
bool wedged:1;
};
struct intel_ring {
......@@ -444,9 +439,8 @@ struct intel_engine_cs {
int (*init_hw)(struct intel_engine_cs *engine);
struct {
struct i915_request *(*prepare)(struct intel_engine_cs *engine);
void (*reset)(struct intel_engine_cs *engine,
struct i915_request *rq);
void (*prepare)(struct intel_engine_cs *engine);
void (*reset)(struct intel_engine_cs *engine, bool stalled);
void (*finish)(struct intel_engine_cs *engine);
} reset;
......@@ -1018,6 +1012,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
return cs;
}
static inline void intel_engine_reset(struct intel_engine_cs *engine,
bool stalled)
{
if (engine->reset.reset)
engine->reset.reset(engine, stalled);
}
void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
bool intel_engine_is_idle(struct intel_engine_cs *engine);
......
......@@ -363,9 +363,7 @@ static int igt_global_reset(void *arg)
/* Check that we can issue a global GPU reset */
igt_global_reset_lock(i915);
set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error);
i915_reset(i915, ALL_ENGINES, NULL);
......@@ -374,9 +372,7 @@ static int igt_global_reset(void *arg)
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
}
mutex_unlock(&i915->drm.struct_mutex);
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
......@@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg)
i915_gem_set_wedged(i915);
GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
i915_reset(i915, ALL_ENGINES, NULL);
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
intel_runtime_pm_put(i915, wakeref);
mutex_unlock(&i915->drm.struct_mutex);
......@@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
if (!wait_for_idle(engine)) {
if (!i915_reset_flush(i915)) {
struct drm_printer p =
drm_info_printer(i915->drm.dev);
......@@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg)
return 0;
}
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
{
struct i915_gpu_error *error = &rq->i915->gpu_error;
u32 reset_count = i915_reset_count(error);
error->stalled_mask = mask;
/* set_bit() must be after we have setup the backchannel (mask) */
smp_mb__before_atomic();
set_bit(I915_RESET_HANDOFF, &error->flags);
u32 count = i915_reset_count(&i915->gpu_error);
wake_up_all(&error->wait_queue);
i915_reset(i915, mask, NULL);
return reset_count;
return count;
}
static int igt_reset_wait(void *arg)
......@@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
reset_count = fake_hangcheck(rq, ALL_ENGINES);
reset_count = fake_hangcheck(i915, ALL_ENGINES);
timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
if (timeout < 0) {
......@@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
......@@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
}
out_reset:
fake_hangcheck(rq, intel_engine_flag(rq->engine));
fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
if (tsk) {
struct igt_wedge_me w;
......@@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg)
goto fini;
}
reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
i915_reset(i915, ENGINE_MASK(id), NULL);
GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
&i915->gpu_error.flags));
reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
......@@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
pr_err("%s(%s): Failed to start request %llx, at %x\n",
__func__, engine->name,
rq->fence.seqno, hws_seqno(&h, rq));
i915_gem_set_wedged(i915);
err = -EIO;
}
......@@ -1588,7 +1570,6 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
static void force_reset(struct drm_i915_private *i915)
{
i915_gem_set_wedged(i915);
set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
i915_reset(i915, 0, NULL);
}
......@@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg)
if (i915_terminally_wedged(&i915->gpu_error))
goto unlock;
if (intel_has_gpu_reset(i915)) {
const typeof(*phases) *p;
for (p = phases; p->name; p++) {
GEM_TRACE("intel_gpu_reset under %s\n", p->name);
p->critical_section_begin();
err = intel_gpu_reset(i915, ALL_ENGINES);
p->critical_section_end();
if (err) {
pr_err("intel_gpu_reset failed under %s\n",
p->name);
goto out;
}
}
force_reset(i915);
}
if (intel_has_reset_engine(i915)) {
struct intel_engine_cs *engine;
enum intel_engine_id id;
......
......@@ -214,7 +214,6 @@ static int check_whitelist(struct i915_gem_context *ctx,
static int do_device_reset(struct intel_engine_cs *engine)
{
set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
return 0;
}
......@@ -394,7 +393,6 @@ static int
live_gpu_reset_gt_engine_workarounds(void *arg)
{
struct drm_i915_private *i915 = arg;
struct i915_gpu_error *error = &i915->gpu_error;
intel_wakeref_t wakeref;
struct wa_lists lists;
bool ok;
......@@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg)
if (!ok)
goto out;
set_bit(I915_RESET_HANDOFF, &error->flags);
i915_reset(i915, ALL_ENGINES, "live_workarounds");
ok = verify_gt_engine_wa(i915, &lists, "after reset");
......
......@@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
i915_gem_contexts_lost(i915);
mutex_unlock(&i915->drm.struct_mutex);
cancel_delayed_work_sync(&i915->gt.retire_work);
cancel_delayed_work_sync(&i915->gt.idle_work);
drain_delayed_work(&i915->gt.retire_work);
drain_delayed_work(&i915->gt.idle_work);
i915_gem_drain_workqueue(i915);
mutex_lock(&i915->drm.struct_mutex);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment